line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
// This file is part of UDPipe . |
2
|
|
|
|
|
|
|
// |
3
|
|
|
|
|
|
|
// This file is a bundle of all sources and headers of UDPipe library. |
4
|
|
|
|
|
|
|
// Comments and copyrights of all individual files are kept. |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
#include |
7
|
|
|
|
|
|
|
#include |
8
|
|
|
|
|
|
|
#include |
9
|
|
|
|
|
|
|
#include |
10
|
|
|
|
|
|
|
#include |
11
|
|
|
|
|
|
|
#include |
12
|
|
|
|
|
|
|
#include |
13
|
|
|
|
|
|
|
#include |
14
|
|
|
|
|
|
|
#include |
15
|
|
|
|
|
|
|
#include |
16
|
|
|
|
|
|
|
#include |
17
|
|
|
|
|
|
|
#include |
18
|
|
|
|
|
|
|
#include |
19
|
|
|
|
|
|
|
#include |
20
|
|
|
|
|
|
|
#include |
21
|
|
|
|
|
|
|
#include |
22
|
|
|
|
|
|
|
#include |
23
|
|
|
|
|
|
|
#include |
24
|
|
|
|
|
|
|
#include |
25
|
|
|
|
|
|
|
#include |
26
|
|
|
|
|
|
|
#include |
27
|
|
|
|
|
|
|
#include |
28
|
|
|
|
|
|
|
#include |
29
|
|
|
|
|
|
|
#include |
30
|
|
|
|
|
|
|
#include |
31
|
|
|
|
|
|
|
#include |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
namespace ufal { |
34
|
|
|
|
|
|
|
namespace udpipe { |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
///////// |
37
|
|
|
|
|
|
|
// File: utils/common.h |
38
|
|
|
|
|
|
|
///////// |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
41
|
|
|
|
|
|
|
// |
42
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
43
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
44
|
|
|
|
|
|
|
// |
45
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
46
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
47
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
// Headers available in all sources |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
namespace utils {

using namespace std;

// Assert that int is at least 4B
static_assert(sizeof(int) >= sizeof(int32_t), "Int must be at least 4B wide!");

// Assert that we are on a little endian system. The check is only performed
// when the compiler predefines __BYTE_ORDER__; otherwise it is silently skipped.
#ifdef __BYTE_ORDER__
static_assert(__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__, "Only little endian systems are supported!");
#endif

// Write `message` followed by endl to cerr and call exit with status 1.
// `message` is spliced unparenthesized into the stream expression, so it may
// itself be a chain such as  "cannot open " << name .
#define runtime_failure(message) exit((cerr << message << endl, 1))

} // namespace utils
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
///////// |
68
|
|
|
|
|
|
|
// File: utils/string_piece.h |
69
|
|
|
|
|
|
|
///////// |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
72
|
|
|
|
|
|
|
// |
73
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
74
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
75
|
|
|
|
|
|
|
// |
76
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
77
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
78
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
namespace utils { |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
// Non-owning view of a character range: a pointer plus an explicit length.
// Nothing is copied or allocated, so the referenced bytes must stay alive for
// as long as the view is used. When the length is given explicitly the range
// does not have to be NUL-terminated.
struct string_piece {
  const char* str;
  size_t len;

  // Empty view: null pointer, zero length.
  string_piece() : str(nullptr), len(0) {}
  // View of a NUL-terminated C string. A null pointer produces an empty view
  // instead of being handed to strlen (which would be undefined behaviour).
  string_piece(const char* str) : str(str), len(str ? strlen(str) : 0) {}
  // View of exactly `len` bytes starting at `str`.
  string_piece(const char* str, size_t len) : str(str), len(len) {}
  // View of the whole contents of `str`.
  string_piece(const string& str) : str(str.c_str()), len(str.size()) {}
};

// Write exactly `len` bytes of the view to `os` (unformatted) and return `os`.
inline ostream& operator<<(ostream& os, const string_piece& str) {
  return os.write(str.str, static_cast<streamsize>(str.len));
}

// Two views are equal when they have the same length and the same bytes.
inline bool operator==(const string_piece& a, const string_piece& b) {
  if (a.len != b.len) return false;
  // A zero-length view may hold a null pointer (default constructor), and
  // memcmp must not receive null even for a zero count, so skip the call.
  return a.len == 0 || memcmp(a.str, b.str, a.len) == 0;
}

// Negation of operator== so both comparisons always agree.
inline bool operator!=(const string_piece& a, const string_piece& b) {
  return !(a == b);
}
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
} // namespace utils |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
///////// |
107
|
|
|
|
|
|
|
// File: common.h |
108
|
|
|
|
|
|
|
///////// |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
// This file is part of UDPipe . |
111
|
|
|
|
|
|
|
// |
112
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
113
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
114
|
|
|
|
|
|
|
// |
115
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
116
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
117
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
using namespace utils; |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
///////// |
122
|
|
|
|
|
|
|
// File: sentence/empty_node.h |
123
|
|
|
|
|
|
|
///////// |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
// This file is part of UDPipe . |
126
|
|
|
|
|
|
|
// |
127
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
128
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
129
|
|
|
|
|
|
|
// |
130
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
131
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
132
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
133
|
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
// Plain record describing one empty node; every field is public and the string
// fields start out empty.
class empty_node {
 public:
  int id;         // 0 is root, >0 is sentence word, <0 is undefined
  int index;      // index for the current id, should be numbered from 1, 0=undefined
  string form;    // form
  string lemma;   // lemma
  string upostag; // universal part-of-speech tag
  string xpostag; // language-specific part-of-speech tag
  string feats;   // list of morphological features
  string deps;    // secondary dependencies
  string misc;    // miscellaneous information

  // Defaults give an undefined node: id -1 and index 0.
  empty_node(int id = -1, int index = 0) : id(id), index(index) {}
};
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
///////// |
150
|
|
|
|
|
|
|
// File: sentence/token.h |
151
|
|
|
|
|
|
|
///////// |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
// This file is part of UDPipe . |
154
|
|
|
|
|
|
|
// |
155
|
|
|
|
|
|
|
// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of |
156
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
157
|
|
|
|
|
|
|
// |
158
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
159
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
160
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
161
|
|
|
|
|
|
|
|
162
|
24
|
|
|
|
|
|
class token { |
163
|
|
|
|
|
|
|
public: |
164
|
|
|
|
|
|
|
string form; |
165
|
|
|
|
|
|
|
string misc; |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
token(string_piece form = string_piece(), string_piece misc = string_piece()); |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
// CoNLL-U defined SpaceAfter=No feature |
170
|
|
|
|
|
|
|
bool get_space_after() const; |
171
|
|
|
|
|
|
|
void set_space_after(bool space_after); |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
// UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features |
174
|
|
|
|
|
|
|
void get_spaces_before(string& spaces_before) const; |
175
|
|
|
|
|
|
|
void set_spaces_before(string_piece spaces_before); |
176
|
|
|
|
|
|
|
void get_spaces_after(string& spaces_after) const; |
177
|
|
|
|
|
|
|
void set_spaces_after(string_piece spaces_after); |
178
|
|
|
|
|
|
|
void get_spaces_in_token(string& spaces_in_token) const; |
179
|
|
|
|
|
|
|
void set_spaces_in_token(string_piece spaces_in_token); |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
// UDPipe-specific TokenRange feature |
182
|
|
|
|
|
|
|
bool get_token_range(size_t& start, size_t& end) const; |
183
|
|
|
|
|
|
|
void set_token_range(size_t start, size_t end); |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
private: |
186
|
|
|
|
|
|
|
bool get_misc_field(string_piece name, string_piece& value) const; |
187
|
|
|
|
|
|
|
void remove_misc_field(string_piece name); |
188
|
|
|
|
|
|
|
string& start_misc_field(string_piece name); |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
void append_escaped_spaces(string_piece spaces, string& escaped_spaces) const; |
191
|
|
|
|
|
|
|
void unescape_spaces(string_piece escaped_spaces, string& spaces) const; |
192
|
|
|
|
|
|
|
}; |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
///////// |
195
|
|
|
|
|
|
|
// File: sentence/multiword_token.h |
196
|
|
|
|
|
|
|
///////// |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
// This file is part of UDPipe . |
199
|
|
|
|
|
|
|
// |
200
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
201
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
202
|
|
|
|
|
|
|
// |
203
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
204
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
205
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
206
|
|
|
|
|
|
|
|
207
|
0
|
0
|
|
|
|
|
class multiword_token : public token { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
208
|
|
|
|
|
|
|
public: |
209
|
|
|
|
|
|
|
// form and misc are inherited from token |
210
|
|
|
|
|
|
|
int id_first, id_last; |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
multiword_token(int id_first = -1, int id_last = -1, string_piece form = string_piece(), string_piece misc = string_piece()) |
213
|
0
|
|
|
|
|
|
: token(form, misc), id_first(id_first), id_last(id_last) {} |
214
|
|
|
|
|
|
|
}; |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
///////// |
217
|
|
|
|
|
|
|
// File: sentence/word.h |
218
|
|
|
|
|
|
|
///////// |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
// This file is part of UDPipe . |
221
|
|
|
|
|
|
|
// |
222
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
223
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
224
|
|
|
|
|
|
|
// |
225
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
226
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
227
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
228
|
|
|
|
|
|
|
|
229
|
46
|
0
|
|
|
|
|
class word : public token { |
230
|
|
|
|
|
|
|
public: |
231
|
|
|
|
|
|
|
// form and misc are inherited from token |
232
|
|
|
|
|
|
|
int id; // 0 is root, >0 is sentence word, <0 is undefined |
233
|
|
|
|
|
|
|
string lemma; // lemma |
234
|
|
|
|
|
|
|
string upostag; // universal part-of-speech tag |
235
|
|
|
|
|
|
|
string xpostag; // language-specific part-of-speech tag |
236
|
|
|
|
|
|
|
string feats; // list of morphological features |
237
|
|
|
|
|
|
|
int head; // head, 0 is root, <0 is undefined |
238
|
|
|
|
|
|
|
string deprel; // dependency relation to the head |
239
|
|
|
|
|
|
|
string deps; // secondary dependencies |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
vector children; |
242
|
|
|
|
|
|
|
|
243
|
20
|
|
|
|
|
|
word(int id = -1, string_piece form = string_piece()) : token(form), id(id), head(-1) {} |
244
|
|
|
|
|
|
|
}; |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
///////// |
247
|
|
|
|
|
|
|
// File: sentence/sentence.h |
248
|
|
|
|
|
|
|
///////// |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
// This file is part of UDPipe . |
251
|
|
|
|
|
|
|
// |
252
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
253
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
254
|
|
|
|
|
|
|
// |
255
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
256
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
257
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
258
|
|
|
|
|
|
|
|
259
|
0
|
0
|
|
|
|
|
class sentence { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
260
|
|
|
|
|
|
|
public: |
261
|
|
|
|
|
|
|
sentence(); |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
vector words; |
264
|
|
|
|
|
|
|
vector multiword_tokens; |
265
|
|
|
|
|
|
|
vector empty_nodes; |
266
|
|
|
|
|
|
|
vector comments; |
267
|
|
|
|
|
|
|
static const string root_form; |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
// Basic sentence modifications |
270
|
|
|
|
|
|
|
bool empty(); |
271
|
|
|
|
|
|
|
void clear(); |
272
|
|
|
|
|
|
|
word& add_word(string_piece form = string_piece()); |
273
|
|
|
|
|
|
|
void set_head(int id, int head, const string& deprel); |
274
|
|
|
|
|
|
|
void unlink_all_words(); |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
// CoNLL-U defined comments |
277
|
|
|
|
|
|
|
bool get_new_doc(string* id = nullptr) const; |
278
|
|
|
|
|
|
|
void set_new_doc(bool new_doc, string_piece id = string_piece()); |
279
|
|
|
|
|
|
|
bool get_new_par(string* id = nullptr) const; |
280
|
|
|
|
|
|
|
void set_new_par(bool new_par, string_piece id = string_piece()); |
281
|
|
|
|
|
|
|
bool get_sent_id(string& id) const; |
282
|
|
|
|
|
|
|
void set_sent_id(string_piece id); |
283
|
|
|
|
|
|
|
bool get_text(string& text) const; |
284
|
|
|
|
|
|
|
void set_text(string_piece text); |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
private: |
287
|
|
|
|
|
|
|
bool get_comment(string_piece name, string* value) const; |
288
|
|
|
|
|
|
|
void remove_comment(string_piece name); |
289
|
|
|
|
|
|
|
void set_comment(string_piece name, string_piece value = string_piece()); |
290
|
|
|
|
|
|
|
}; |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
///////// |
293
|
|
|
|
|
|
|
// File: sentence/input_format.h |
294
|
|
|
|
|
|
|
///////// |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
// This file is part of UDPipe . |
297
|
|
|
|
|
|
|
// |
298
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
299
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
300
|
|
|
|
|
|
|
// |
301
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
302
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
303
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
304
|
|
|
|
|
|
|
|
305
|
1
|
|
|
|
|
|
class input_format { |
306
|
|
|
|
|
|
|
public: |
307
|
1
|
|
|
|
|
|
virtual ~input_format() {} |
308
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const = 0; |
310
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) = 0; |
311
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) = 0; |
312
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) = 0; |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
// Static factory methods |
315
|
|
|
|
|
|
|
static input_format* new_input_format(const string& name); |
316
|
|
|
|
|
|
|
static input_format* new_conllu_input_format(const string& options = string()); |
317
|
|
|
|
|
|
|
static input_format* new_generic_tokenizer_input_format(const string& options = string()); |
318
|
|
|
|
|
|
|
static input_format* new_horizontal_input_format(const string& options = string()); |
319
|
|
|
|
|
|
|
static input_format* new_vertical_input_format(const string& options = string()); |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
static input_format* new_presegmented_tokenizer(input_format* tokenizer); |
322
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
static const string CONLLU_V1; |
324
|
|
|
|
|
|
|
static const string CONLLU_V2; |
325
|
|
|
|
|
|
|
static const string GENERIC_TOKENIZER_NORMALIZED_SPACES; |
326
|
|
|
|
|
|
|
static const string GENERIC_TOKENIZER_PRESEGMENTED; |
327
|
|
|
|
|
|
|
static const string GENERIC_TOKENIZER_RANGES; |
328
|
|
|
|
|
|
|
}; |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
///////// |
331
|
|
|
|
|
|
|
// File: model/model.h |
332
|
|
|
|
|
|
|
///////// |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
// This file is part of UDPipe . |
335
|
|
|
|
|
|
|
// |
336
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
337
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
338
|
|
|
|
|
|
|
// |
339
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
340
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
341
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
342
|
|
|
|
|
|
|
|
343
|
1
|
|
|
|
|
|
class model { |
344
|
|
|
|
|
|
|
public: |
345
|
1
|
|
|
|
|
|
virtual ~model() {} |
346
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
static model* load(const char* fname); |
348
|
|
|
|
|
|
|
static model* load(istream& is); |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
virtual input_format* new_tokenizer(const string& options) const = 0; |
351
|
|
|
|
|
|
|
virtual bool tag(sentence& s, const string& options, string& error) const = 0; |
352
|
|
|
|
|
|
|
virtual bool parse(sentence& s, const string& options, string& error) const = 0; |
353
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
static const string DEFAULT; |
355
|
|
|
|
|
|
|
static const string TOKENIZER_NORMALIZED_SPACES; |
356
|
|
|
|
|
|
|
static const string TOKENIZER_PRESEGMENTED; |
357
|
|
|
|
|
|
|
static const string TOKENIZER_RANGES; |
358
|
|
|
|
|
|
|
}; |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
///////// |
361
|
|
|
|
|
|
|
// File: model/evaluator.h |
362
|
|
|
|
|
|
|
///////// |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
// This file is part of UDPipe . |
365
|
|
|
|
|
|
|
// |
366
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
367
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
368
|
|
|
|
|
|
|
// |
369
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
370
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
371
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
class evaluator { |
374
|
|
|
|
|
|
|
public: |
375
|
|
|
|
|
|
|
evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser); |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
void set_model(const model* m); |
378
|
|
|
|
|
|
|
void set_tokenizer(const string& tokenizer); |
379
|
|
|
|
|
|
|
void set_tagger(const string& tagger); |
380
|
|
|
|
|
|
|
void set_parser(const string& parser); |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
bool evaluate(istream& is, ostream& os, string& error) const; |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
static const string DEFAULT; |
385
|
|
|
|
|
|
|
static const string NONE; |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
private: |
388
|
|
|
|
|
|
|
const model* m; |
389
|
|
|
|
|
|
|
string tokenizer, tagger, parser; |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
struct f1_info { size_t total_system, total_gold; double precision, recall, f1; }; |
392
|
|
|
|
|
|
|
template |
393
|
|
|
|
|
|
|
static f1_info evaluate_f1(const vector>& system, const vector>& gold); |
394
|
|
|
|
|
|
|
|
395
|
0
|
|
|
|
|
|
class evaluation_data { |
396
|
|
|
|
|
|
|
public: |
397
|
0
|
|
|
|
|
|
struct word_data { |
398
|
|
|
|
|
|
|
size_t start, end; |
399
|
|
|
|
|
|
|
bool is_multiword; |
400
|
|
|
|
|
|
|
word w; |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
word_data(size_t start, size_t end, int id, bool is_multiword, const word& w); |
403
|
|
|
|
|
|
|
}; |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
void add_sentence(const sentence& s); |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
u32string chars; |
408
|
|
|
|
|
|
|
vector> sentences, tokens; |
409
|
|
|
|
|
|
|
vector> multiwords; |
410
|
|
|
|
|
|
|
vector words; |
411
|
|
|
|
|
|
|
}; |
412
|
|
|
|
|
|
|
|
413
|
0
|
|
|
|
|
|
class word_alignment { |
414
|
|
|
|
|
|
|
public: |
415
|
0
|
|
|
|
|
|
struct pair_system_gold { |
416
|
|
|
|
|
|
|
word system; const word& gold; |
417
|
0
|
0
|
|
|
|
|
pair_system_gold(const word& system, const word& gold) : system(system), gold(gold) {} |
418
|
|
|
|
|
|
|
}; |
419
|
|
|
|
|
|
|
vector matched; |
420
|
|
|
|
|
|
|
size_t total_system, total_gold; |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
template |
423
|
|
|
|
|
|
|
f1_info evaluate_f1(Equals equals); |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
static bool perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment); |
426
|
|
|
|
|
|
|
static void best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment); |
427
|
|
|
|
|
|
|
}; |
428
|
|
|
|
|
|
|
}; |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
///////// |
431
|
|
|
|
|
|
|
// File: unilib/unicode.h |
432
|
|
|
|
|
|
|
///////// |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
// This file is part of UniLib . |
435
|
|
|
|
|
|
|
// |
436
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
437
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
438
|
|
|
|
|
|
|
// |
439
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
440
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
441
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
442
|
|
|
|
|
|
|
// |
443
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
444
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
namespace unilib { |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
// Static lookups of the general category and case mappings of a code point.
// Categories are exposed as a bit mask so several of them can be tested at
// once; the data tables are only declared here.
class unicode {
  // Category numbers as stored in the tables; each public mask below is
  // 1 shifted left by the matching number.
  enum : uint8_t {
    _Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5,
    _Mn = 6, _Mc = 7, _Me = 8,
    _Nd = 9, _Nl = 10, _No = 11,
    _Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18,
    _Sm = 19, _Sc = 20, _Sk = 21, _So = 22,
    _Zs = 23, _Zl = 24, _Zp = 25,
    _Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30
  };

 public:
  typedef uint32_t category_t;
  // One bit per category, plus unions for each major class (L, M, N, P, S, Z, C)
  // and the combinations Lut and LC.
  enum : category_t {
    Lu = 1 << _Lu, Ll = 1 << _Ll, Lt = 1 << _Lt, Lut = Lu | Lt, LC = Lu | Ll | Lt,
      Lm = 1 << _Lm, Lo = 1 << _Lo, L = Lu | Ll | Lt | Lm | Lo,
    Mn = 1 << _Mn, Mc = 1 << _Mc, Me = 1 << _Me, M = Mn | Mc | Me,
    Nd = 1 << _Nd, Nl = 1 << _Nl, No = 1 << _No, N = Nd | Nl | No,
    Pc = 1 << _Pc, Pd = 1 << _Pd, Ps = 1 << _Ps, Pe = 1 << _Pe, Pi = 1 << _Pi,
      Pf = 1 << _Pf, Po = 1 << _Po, P = Pc | Pd | Ps | Pe | Pi | Pf | Po,
    Sm = 1 << _Sm, Sc = 1 << _Sc, Sk = 1 << _Sk, So = 1 << _So, S = Sm | Sc | Sk | So,
    Zs = 1 << _Zs, Zl = 1 << _Zl, Zp = 1 << _Zp, Z = Zs | Zl | Zp,
    Cc = 1 << _Cc, Cf = 1 << _Cf, Cs = 1 << _Cs, Co = 1 << _Co, Cn = 1 << _Cn, C = Cc | Cf | Cs | Co | Cn
  };

  static inline category_t category(char32_t chr);

  static inline char32_t lowercase(char32_t chr);
  static inline char32_t uppercase(char32_t chr);
  static inline char32_t titlecase(char32_t chr);

 private:
  // Number of code points covered by the tables; larger values get DEFAULT_CAT
  // from category() and are returned unchanged by the case mappings.
  static const char32_t CHARS = 0x110000;
  static const int32_t DEFAULT_CAT = Cn;

  // Two-level tables: the index is selected by chr >> 8 and names a 256-entry
  // block, which is then indexed by chr & 0xFF.
  static const uint8_t category_index[CHARS >> 8];
  static const uint8_t category_block[][256];
  static const uint8_t othercase_index[CHARS >> 8];
  static const char32_t othercase_block[][256];

  // Low byte of an othercase_block entry; the remaining bits hold a code point.
  enum othercase_type { LOWER_ONLY = 1, UPPERTITLE_ONLY = 2, UPPER_ONLY = 3, LOWER_THEN_UPPER = 4, UPPER_THEN_TITLE = 5, TITLE_THEN_LOWER = 6 };
};
490
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
// Return the category bit of `chr`: 1 shifted left by the category number
// stored in the two-level table. Values at or above CHARS yield DEFAULT_CAT.
unicode::category_t unicode::category(char32_t chr) {
  if (chr >= CHARS) return DEFAULT_CAT;
  return 1 << category_block[category_index[chr >> 8]][chr & 0xFF];
}
494
|
|
|
|
|
|
|
|
495
|
30
|
|
|
|
|
|
char32_t unicode::lowercase(char32_t chr) { |
496
|
30
|
50
|
|
|
|
|
if (chr < CHARS) { |
497
|
30
|
|
|
|
|
|
char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
498
|
30
|
100
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8; |
499
|
28
|
50
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8; |
500
|
28
|
50
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
501
|
|
|
|
|
|
|
} |
502
|
|
|
|
|
|
|
return chr; |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
0
|
|
|
|
|
|
char32_t unicode::uppercase(char32_t chr) { |
506
|
0
|
0
|
|
|
|
|
if (chr < CHARS) { |
507
|
0
|
|
|
|
|
|
char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
508
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
509
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8; |
510
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8; |
511
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
return chr; |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
char32_t unicode::titlecase(char32_t chr) { |
517
|
|
|
|
|
|
|
if (chr < CHARS) { |
518
|
|
|
|
|
|
|
char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
519
|
|
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
520
|
|
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase >> 8; |
521
|
|
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
522
|
|
|
|
|
|
|
} |
523
|
|
|
|
|
|
|
return chr; |
524
|
|
|
|
|
|
|
} |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
} // namespace unilib |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
///////// |
529
|
|
|
|
|
|
|
// File: unilib/utf8.h |
530
|
|
|
|
|
|
|
///////// |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
// This file is part of UniLib . |
533
|
|
|
|
|
|
|
// |
534
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
535
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
536
|
|
|
|
|
|
|
// |
537
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
538
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
539
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
540
|
|
|
|
|
|
|
// |
541
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
542
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
namespace unilib { |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
// Static helpers for validating, decoding, encoding and mapping UTF-8 text.
// Most functions come in three flavours: NUL-terminated C string, pointer plus
// length, and std::string.
class utf8 {
 public:
  static bool valid(const char* str);
  static bool valid(const char* str, size_t len);
  static inline bool valid(const std::string& str);

  // The reference parameters suggest decode advances `str` (and reduces `len`)
  // past the consumed bytes, while first takes its arguments by value.
  static inline char32_t decode(const char*& str);
  static inline char32_t decode(const char*& str, size_t& len);
  static inline char32_t first(const char* str);
  static inline char32_t first(const char* str, size_t len);
  static inline char32_t first(const std::string& str);

  static void decode(const char* str, std::u32string& decoded);
  static void decode(const char* str, size_t len, std::u32string& decoded);
  static inline void decode(const std::string& str, std::u32string& decoded);

  // Range over a NUL-terminated string; only utf8 can construct it (private
  // constructor plus friend declaration), via decoder() below.
  class string_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline string_decoder(const char* str);
    const char* str;
    friend class utf8;
  };
  static inline string_decoder decoder(const char* str);
  static inline string_decoder decoder(const std::string& str);

  // Same as string_decoder, for a buffer with an explicit length.
  class buffer_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline buffer_decoder(const char* str, size_t len);
    const char* str;
    size_t len;
    friend class utf8;
  };
  static inline buffer_decoder decoder(const char* str, size_t len);

  static inline void append(char*& str, char32_t chr);
  static inline void append(std::string& str, char32_t chr);
  static void encode(const std::u32string& str, std::string& encoded);

  // NOTE(review): the template headers were lost in the extracted text;
  // `template<class F>` is restored because F is the type of the first parameter.
  template<class F> static void map(F f, const char* str, std::string& result);
  template<class F> static void map(F f, const char* str, size_t len, std::string& result);
  template<class F> static void map(F f, const std::string& str, std::string& result);

 private:
  // Value returned by decode when it encounters an invalid byte sequence.
  static const char REPLACEMENT_CHAR = '?';
};
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
// Convenience overload: checks a std::string by forwarding its C-string view
// to the const char* overload of valid().
bool utf8::valid(const std::string& str) {
  const char* const data = str.c_str();
  return valid(data);
}
603
|
|
|
|
|
|
|
|
604
|
54
|
|
|
|
|
|
char32_t utf8::decode(const char*& str) { |
605
|
54
|
50
|
|
|
|
|
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
606
|
0
|
0
|
|
|
|
|
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
607
|
0
|
0
|
|
|
|
|
else if (((unsigned char)*str) < 0xE0) { |
608
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x1F) << 6; |
609
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
610
|
0
|
|
|
|
|
|
return res + (((unsigned char)*str++) & 0x3F); |
611
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF0) { |
612
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x0F) << 12; |
613
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
614
|
0
|
|
|
|
|
|
res += (((unsigned char)*str++) & 0x3F) << 6; |
615
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
616
|
0
|
|
|
|
|
|
return res + (((unsigned char)*str++) & 0x3F); |
617
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF8) { |
618
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x07) << 18; |
619
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
620
|
0
|
|
|
|
|
|
res += (((unsigned char)*str++) & 0x3F) << 12; |
621
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
622
|
0
|
|
|
|
|
|
res += (((unsigned char)*str++) & 0x3F) << 6; |
623
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
624
|
0
|
|
|
|
|
|
return res + (((unsigned char)*str++) & 0x3F); |
625
|
0
|
|
|
|
|
|
} else return ++str, REPLACEMENT_CHAR; |
626
|
|
|
|
|
|
|
} |
627
|
|
|
|
|
|
|
|
628
|
145
|
|
|
|
|
|
char32_t utf8::decode(const char*& str, size_t& len) { |
629
|
145
|
50
|
|
|
|
|
if (!len) return 0; |
630
|
145
|
|
|
|
|
|
--len; |
631
|
145
|
100
|
|
|
|
|
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
632
|
23
|
50
|
|
|
|
|
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
633
|
23
|
50
|
|
|
|
|
else if (((unsigned char)*str) < 0xE0) { |
634
|
23
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x1F) << 6; |
635
|
23
|
50
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
636
|
23
|
|
|
|
|
|
return res + ((--len, ((unsigned char)*str++)) & 0x3F); |
637
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF0) { |
638
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x0F) << 12; |
639
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
640
|
0
|
|
|
|
|
|
res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6; |
641
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
642
|
0
|
|
|
|
|
|
return res + ((--len, ((unsigned char)*str++)) & 0x3F); |
643
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF8) { |
644
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x07) << 18; |
645
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
646
|
0
|
|
|
|
|
|
res += ((--len, ((unsigned char)*str++)) & 0x3F) << 12; |
647
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
648
|
0
|
|
|
|
|
|
res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6; |
649
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
650
|
0
|
|
|
|
|
|
return res + ((--len, ((unsigned char)*str++)) & 0x3F); |
651
|
0
|
|
|
|
|
|
} else return ++str, REPLACEMENT_CHAR; |
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
char32_t utf8::first(const char* str) { |
655
|
0
|
|
|
|
|
|
return decode(str); |
656
|
|
|
|
|
|
|
} |
657
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
char32_t utf8::first(const char* str, size_t len) { |
659
|
0
|
|
|
|
|
|
return decode(str, len); |
660
|
|
|
|
|
|
|
} |
661
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
// Returns the first code point of a std::string by forwarding its C-string
// view to the const char* overload.
char32_t utf8::first(const std::string& str) {
  const char* const data = str.c_str();
  return first(data);
}
665
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
void utf8::decode(const std::string& str, std::u32string& decoded) { |
667
|
|
|
|
|
|
|
decode(str.c_str(), decoded); |
668
|
|
|
|
|
|
|
} |
669
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
class utf8::string_decoder::iterator : public std::iterator { |
671
|
|
|
|
|
|
|
public: |
672
|
36
|
|
|
|
|
|
iterator(const char* str) : codepoint(0), next(str) { operator++(); } |
673
|
|
|
|
|
|
|
iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {} |
674
|
54
|
0
|
|
|
|
|
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
675
|
|
|
|
|
|
|
iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; } |
676
|
|
|
|
|
|
|
bool operator==(const iterator& other) const { return next == other.next; } |
677
|
|
|
|
|
|
|
bool operator!=(const iterator& other) const { return next != other.next; } |
678
|
|
|
|
|
|
|
const char32_t& operator*() { return codepoint; } |
679
|
|
|
|
|
|
|
private: |
680
|
|
|
|
|
|
|
char32_t codepoint; |
681
|
|
|
|
|
|
|
const char* next; |
682
|
|
|
|
|
|
|
}; |
683
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
// Remembers the start of the NUL-terminated string to decode (not owned).
utf8::string_decoder::string_decoder(const char* str)
  : str(str)
{}
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
// Iterator positioned at the first code point of the string.
utf8::string_decoder::iterator utf8::string_decoder::begin() {
  iterator first_codepoint(str);
  return first_codepoint;
}
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
// End iterator: constructed from a null pointer, which is the state every
// iterator reaches after decoding the terminating NUL.
utf8::string_decoder::iterator utf8::string_decoder::end() {
  iterator past_the_end(nullptr);
  return past_the_end;
}
693
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
// Creates a code-point range over a NUL-terminated UTF-8 string.
utf8::string_decoder utf8::decoder(const char* str) {
  string_decoder range(str);
  return range;
}
697
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
// Creates a code-point range over a std::string. The range keeps only the
// pointer returned by c_str(), so str must outlive the returned decoder.
utf8::string_decoder utf8::decoder(const std::string& str) {
  string_decoder range(str.c_str());
  return range;
}
701
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
class utf8::buffer_decoder::iterator : public std::iterator { |
703
|
|
|
|
|
|
|
public: |
704
|
0
|
|
|
|
|
|
iterator(const char* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); } |
705
|
|
|
|
|
|
|
iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {} |
706
|
0
|
0
|
|
|
|
|
iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; } |
|
|
0
|
|
|
|
|
|
707
|
|
|
|
|
|
|
iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; } |
708
|
|
|
|
|
|
|
bool operator==(const iterator& other) const { return next == other.next; } |
709
|
|
|
|
|
|
|
bool operator!=(const iterator& other) const { return next != other.next; } |
710
|
|
|
|
|
|
|
const char32_t& operator*() { return codepoint; } |
711
|
|
|
|
|
|
|
private: |
712
|
|
|
|
|
|
|
char32_t codepoint; |
713
|
|
|
|
|
|
|
const char* next; |
714
|
|
|
|
|
|
|
size_t len; |
715
|
|
|
|
|
|
|
}; |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
// Remembers the buffer to decode (not owned) and its length in bytes.
utf8::buffer_decoder::buffer_decoder(const char* str, size_t len)
  : str(str), len(len)
{}
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
// Iterator positioned at the first code point of the buffer.
utf8::buffer_decoder::iterator utf8::buffer_decoder::begin() {
  iterator first_codepoint(str, len);
  return first_codepoint;
}
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
// End iterator: an iterator over an empty buffer, whose pointer is reset to
// nullptr exactly like that of an iterator that ran out of input.
utf8::buffer_decoder::iterator utf8::buffer_decoder::end() {
  iterator past_the_end(nullptr, 0);
  return past_the_end;
}
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
// Creates a code-point range over a UTF-8 buffer of len bytes.
utf8::buffer_decoder utf8::decoder(const char* str, size_t len) {
  buffer_decoder range(str, len);
  return range;
}
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
void utf8::append(char*& str, char32_t chr) { |
732
|
|
|
|
|
|
|
if (chr < 0x80) *str++ = chr; |
733
|
|
|
|
|
|
|
else if (chr < 0x800) { *str++ = 0xC0 + (chr >> 6); *str++ = 0x80 + (chr & 0x3F); } |
734
|
|
|
|
|
|
|
else if (chr < 0x10000) { *str++ = 0xE0 + (chr >> 12); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); } |
735
|
|
|
|
|
|
|
else if (chr < 0x200000) { *str++ = 0xF0 + (chr >> 18); *str++ = 0x80 + ((chr >> 12) & 0x3F); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); } |
736
|
|
|
|
|
|
|
else *str++ = REPLACEMENT_CHAR; |
737
|
|
|
|
|
|
|
} |
738
|
|
|
|
|
|
|
|
739
|
30
|
|
|
|
|
|
void utf8::append(std::string& str, char32_t chr) { |
740
|
30
|
100
|
|
|
|
|
if (chr < 0x80) str += chr; |
741
|
5
|
50
|
|
|
|
|
else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); } |
742
|
0
|
0
|
|
|
|
|
else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
743
|
0
|
0
|
|
|
|
|
else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
744
|
|
|
|
|
|
|
else str += REPLACEMENT_CHAR; |
745
|
30
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
|
747
|
0
|
|
|
|
|
|
template void utf8::map(F f, const char* str, std::string& result) { |
748
|
|
|
|
|
|
|
result.clear(); |
749
|
|
|
|
|
|
|
|
750
|
0
|
0
|
|
|
|
|
for (char32_t chr; (chr = decode(str)); ) |
751
|
0
|
|
|
|
|
|
append(result, f(chr)); |
752
|
0
|
|
|
|
|
|
} |
753
|
|
|
|
|
|
|
|
754
|
7
|
|
|
|
|
|
template void utf8::map(F f, const char* str, size_t len, std::string& result) { |
755
|
|
|
|
|
|
|
result.clear(); |
756
|
|
|
|
|
|
|
|
757
|
36
|
100
|
|
|
|
|
while (len) |
758
|
29
|
|
|
|
|
|
append(result, f(decode(str, len))); |
759
|
7
|
|
|
|
|
|
} |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
// Applies f to every code point of a std::string and stores the re-encoded
// outcome in result (any previous contents are discarded).
//
// The string's length is passed explicitly, so the complete contents are
// mapped even when the string contains embedded NUL characters; forwarding
// only c_str() would silently stop at the first one.
template <class F> void utf8::map(F f, const std::string& str, std::string& result) {
  map(f, str.c_str(), str.size(), result);
}
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
} // namespace unilib |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
///////// |
768
|
|
|
|
|
|
|
// File: model/evaluator.cpp |
769
|
|
|
|
|
|
|
///////// |
770
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
// This file is part of UDPipe . |
772
|
|
|
|
|
|
|
// |
773
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
774
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
775
|
|
|
|
|
|
|
// |
776
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
777
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
778
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
779
|
|
|
|
|
|
|
|
780
|
2
|
|
|
|
|
|
// Special component names: DEFAULT is the empty string; evaluate() compares
// the tokenizer/tagger/parser settings against NONE ("none") to decide which
// stages to skip.
const string evaluator::DEFAULT = string();
const string evaluator::NONE("none");
782
|
|
|
|
|
|
|
|
783
|
0
|
|
|
|
|
|
// Creates an evaluator for model m with the given tokenizer, tagger and
// parser settings; every argument is stored through the matching setter.
evaluator::evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser) {
  this->set_model(m);
  this->set_tokenizer(tokenizer);
  this->set_tagger(tagger);
  this->set_parser(parser);
}
789
|
|
|
|
|
|
|
|
790
|
0
|
|
|
|
|
|
// Sets the model used by evaluate(). Only the pointer is stored.
void evaluator::set_model(const model* m) {
  // The qualified name selects the data member hidden by the parameter.
  evaluator::m = m;
}
793
|
|
|
|
|
|
|
|
794
|
0
|
|
|
|
|
|
// Stores the tokenizer setting; evaluate() skips tokenization when it equals
// NONE.
void evaluator::set_tokenizer(const string& tokenizer) {
  this->tokenizer.assign(tokenizer);
}
797
|
|
|
|
|
|
|
|
798
|
0
|
|
|
|
|
|
// Stores the tagger setting; evaluate() skips tagging when it equals NONE.
void evaluator::set_tagger(const string& tagger) {
  this->tagger.assign(tagger);
}
801
|
|
|
|
|
|
|
|
802
|
0
|
|
|
|
|
|
// Stores the parser setting; evaluate() skips parsing when it equals NONE.
void evaluator::set_parser(const string& parser) {
  this->parser.assign(parser);
}
805
|
|
|
|
|
|
|
|
806
|
0
|
|
|
|
|
|
bool evaluator::evaluate(istream& is, ostream& os, string& error) const { |
807
|
|
|
|
|
|
|
error.clear(); |
808
|
|
|
|
|
|
|
|
809
|
0
|
0
|
|
|
|
|
unique_ptr conllu_input(input_format::new_conllu_input_format()); |
810
|
0
|
0
|
|
|
|
|
if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false; |
|
|
0
|
|
|
|
|
|
811
|
|
|
|
|
|
|
|
812
|
0
|
0
|
|
|
|
|
vector plain_text_paragraphs(1); unsigned space_after_nos = 0; |
813
|
0
|
0
|
|
|
|
|
sentence system, gold; |
|
|
0
|
|
|
|
|
|
814
|
0
|
|
|
|
|
|
evaluation_data gold_data, system_goldtok_data, system_goldtok_goldtags_data, system_plaintext_data; |
815
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
string block; |
817
|
0
|
0
|
|
|
|
|
while (conllu_input->read_block(is, block)) { |
|
|
0
|
|
|
|
|
|
818
|
0
|
0
|
|
|
|
|
conllu_input->set_text(block); |
819
|
0
|
0
|
|
|
|
|
while (conllu_input->next_sentence(gold, error)) { |
|
|
0
|
|
|
|
|
|
820
|
0
|
0
|
|
|
|
|
gold_data.add_sentence(gold); |
821
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
// Detokenize the input when tokenizing |
823
|
0
|
0
|
|
|
|
|
if (tokenizer != NONE) { |
824
|
0
|
0
|
|
|
|
|
if (gold.get_new_doc() || gold.get_new_par()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
825
|
0
|
0
|
|
|
|
|
plain_text_paragraphs.back().append("\n\n"); |
826
|
0
|
0
|
|
|
|
|
plain_text_paragraphs.emplace_back(); |
827
|
|
|
|
|
|
|
} |
828
|
|
|
|
|
|
|
|
829
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < gold.words.size(); i++) { |
830
|
0
|
0
|
|
|
|
|
const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i]; |
|
|
0
|
|
|
|
|
|
831
|
|
|
|
|
|
|
plain_text_paragraphs.back().append(tok.form); |
832
|
0
|
0
|
|
|
|
|
if (tok.get_space_after()) |
|
|
0
|
|
|
|
|
|
833
|
0
|
0
|
|
|
|
|
plain_text_paragraphs.back().push_back(' '); |
834
|
|
|
|
|
|
|
else |
835
|
0
|
|
|
|
|
|
space_after_nos += 1; |
836
|
0
|
0
|
|
|
|
|
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
837
|
0
|
|
|
|
|
|
i = gold.multiword_tokens[j++].id_last; |
838
|
|
|
|
|
|
|
} |
839
|
|
|
|
|
|
|
} |
840
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
// Goldtok data |
842
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger != NONE) { |
843
|
0
|
0
|
|
|
|
|
system.clear(); |
844
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < gold.words.size(); i++) |
845
|
|
|
|
|
|
|
system.add_word(gold.words[i].form); |
846
|
|
|
|
|
|
|
|
847
|
0
|
0
|
|
|
|
|
if (tagger != NONE) { |
848
|
0
|
0
|
|
|
|
|
if (!m->tag(system, tagger, error)) |
|
|
0
|
|
|
|
|
|
849
|
|
|
|
|
|
|
return false; |
850
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
851
|
0
|
0
|
|
|
|
|
if (!m->parse(system, parser, error)) |
|
|
0
|
|
|
|
|
|
852
|
|
|
|
|
|
|
return false; |
853
|
|
|
|
|
|
|
} |
854
|
0
|
0
|
|
|
|
|
system_goldtok_data.add_sentence(system); |
855
|
|
|
|
|
|
|
} |
856
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
// Goldtok_goldtags data |
858
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
|
0
|
|
|
|
|
|
859
|
0
|
0
|
|
|
|
|
system.clear(); |
860
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < gold.words.size(); i++) { |
861
|
|
|
|
|
|
|
system.add_word(gold.words[i].form); |
862
|
0
|
|
|
|
|
|
system.words[i].upostag = gold.words[i].upostag; |
863
|
0
|
|
|
|
|
|
system.words[i].xpostag = gold.words[i].xpostag; |
864
|
0
|
|
|
|
|
|
system.words[i].feats = gold.words[i].feats; |
865
|
0
|
|
|
|
|
|
system.words[i].lemma = gold.words[i].lemma; |
866
|
|
|
|
|
|
|
} |
867
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
868
|
0
|
0
|
|
|
|
|
if (!m->parse(system, parser, error)) |
|
|
0
|
|
|
|
|
|
869
|
|
|
|
|
|
|
return false; |
870
|
0
|
0
|
|
|
|
|
system_goldtok_goldtags_data.add_sentence(system); |
871
|
|
|
|
|
|
|
} |
872
|
|
|
|
|
|
|
} |
873
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
874
|
|
|
|
|
|
|
} |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
// Tokenize, tag and parse plaintext input |
877
|
0
|
0
|
|
|
|
|
if (tokenizer != NONE) { |
878
|
0
|
0
|
|
|
|
|
unique_ptr t(m->new_tokenizer(tokenizer)); |
879
|
0
|
0
|
|
|
|
|
if (!t) return error.assign("Cannot allocate new tokenizer!"), false; |
|
|
0
|
|
|
|
|
|
880
|
|
|
|
|
|
|
|
881
|
0
|
0
|
|
|
|
|
for (auto&& plain_text : plain_text_paragraphs) { |
882
|
0
|
0
|
|
|
|
|
t->set_text(plain_text); |
883
|
0
|
0
|
|
|
|
|
while (t->next_sentence(system, error)) { |
|
|
0
|
|
|
|
|
|
884
|
0
|
0
|
|
|
|
|
if (tagger != NONE) { |
885
|
0
|
0
|
|
|
|
|
if (!m->tag(system, tagger, error)) |
|
|
0
|
|
|
|
|
|
886
|
|
|
|
|
|
|
return false; |
887
|
|
|
|
|
|
|
|
888
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
889
|
0
|
0
|
|
|
|
|
if (!m->parse(system, parser, error)) |
|
|
0
|
|
|
|
|
|
890
|
|
|
|
|
|
|
return false; |
891
|
|
|
|
|
|
|
} |
892
|
0
|
0
|
|
|
|
|
system_plaintext_data.add_sentence(system); |
893
|
|
|
|
|
|
|
} |
894
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
895
|
|
|
|
|
|
|
} |
896
|
|
|
|
|
|
|
} |
897
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
// Evaluate from plain text |
899
|
0
|
0
|
|
|
|
|
if (tokenizer != NONE) { |
900
|
0
|
0
|
|
|
|
|
if (system_plaintext_data.chars != gold_data.chars) { |
901
|
|
|
|
|
|
|
os << "Cannot evaluate tokenizer, it returned different sequence of token characters!" << endl; |
902
|
|
|
|
|
|
|
} else { |
903
|
|
|
|
|
|
|
word_alignment plaintext_alignment; |
904
|
0
|
0
|
|
|
|
|
word_alignment::best_alignment(system_plaintext_data, gold_data, plaintext_alignment); |
905
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
os << "Number of SpaceAfter=No features in gold data: " << space_after_nos << endl; |
907
|
|
|
|
|
|
|
|
908
|
0
|
|
|
|
|
|
auto tokens = evaluate_f1(system_plaintext_data.tokens, gold_data.tokens); |
909
|
0
|
|
|
|
|
|
auto multiwords = evaluate_f1(system_plaintext_data.multiwords, gold_data.multiwords); |
910
|
0
|
|
|
|
|
|
auto sentences = evaluate_f1(system_plaintext_data.sentences, gold_data.sentences); |
911
|
0
|
|
|
|
|
|
auto words = plaintext_alignment.evaluate_f1([](const word&, const word&) {return true;}); |
912
|
0
|
0
|
|
|
|
|
if (multiwords.total_gold || multiwords.total_system) |
|
|
0
|
|
|
|
|
|
913
|
0
|
|
|
|
|
|
os << "Tokenizer tokens - system: " << tokens.total_system << ", gold: " << tokens.total_gold |
914
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * tokens.precision |
915
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * tokens.recall << "%, f1: " << 100. * tokens.f1 << "%" << endl |
916
|
|
|
|
|
|
|
<< "Tokenizer multiword tokens - system: " << multiwords.total_system << ", gold: " << multiwords.total_gold |
917
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * multiwords.precision |
918
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * multiwords.recall << "%, f1: " << 100. * multiwords.f1 << "%" << endl; |
919
|
0
|
|
|
|
|
|
os << "Tokenizer words - system: " << words.total_system << ", gold: " << words.total_gold |
920
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * words.precision |
921
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * words.recall << "%, f1: " << 100. * words.f1 << "%" << endl |
922
|
0
|
|
|
|
|
|
<< "Tokenizer sentences - system: " << sentences.total_system << ", gold: " << sentences.total_gold |
923
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * sentences.precision |
924
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * sentences.recall << "%, f1: " << 100. * sentences.f1 << "%" << endl; |
925
|
|
|
|
|
|
|
|
926
|
0
|
0
|
|
|
|
|
if (tagger != NONE) { |
927
|
0
|
|
|
|
|
|
auto upostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; }); |
928
|
0
|
|
|
|
|
|
auto xpostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; }); |
929
|
0
|
|
|
|
|
|
auto feats = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; }); |
930
|
0
|
0
|
|
|
|
|
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
931
|
0
|
|
|
|
|
|
auto lemmas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; }); |
932
|
0
|
|
|
|
|
|
os << "Tagging from plain text (CoNLL17 F1 score) - gold forms: " << upostags.total_gold << ", upostag: " |
933
|
0
|
|
|
|
|
|
<< fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: " |
934
|
0
|
|
|
|
|
|
<< 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: " |
935
|
0
|
|
|
|
|
|
<< 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl; |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
|
938
|
0
|
0
|
|
|
|
|
if (tagger != NONE && parser != NONE) { |
939
|
0
|
|
|
|
|
|
auto uas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; }); |
940
|
0
|
0
|
|
|
|
|
auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
0
|
|
|
|
|
|
941
|
0
|
|
|
|
|
|
os << "Parsing from plain text with computed tags (CoNLL17 F1 score) - gold forms: " << uas.total_gold |
942
|
0
|
|
|
|
|
|
<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl; |
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
} |
945
|
|
|
|
|
|
|
} |
946
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
// Evaluate tagger from gold tokenization |
948
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger != NONE) { |
949
|
|
|
|
|
|
|
word_alignment goldtok_alignment; |
950
|
0
|
0
|
|
|
|
|
if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment)) |
|
|
0
|
|
|
|
|
|
951
|
0
|
0
|
|
|
|
|
return error.assign("Internal UDPipe error (the words of the gold data do not match)!"), false; |
952
|
|
|
|
|
|
|
|
953
|
0
|
|
|
|
|
|
auto upostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; }); |
954
|
0
|
|
|
|
|
|
auto xpostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; }); |
955
|
0
|
|
|
|
|
|
auto feats = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; }); |
956
|
0
|
0
|
|
|
|
|
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
957
|
0
|
|
|
|
|
|
auto lemmas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; }); |
958
|
0
|
|
|
|
|
|
os << "Tagging from gold tokenization - forms: " << upostags.total_gold << ", upostag: " |
959
|
0
|
|
|
|
|
|
<< fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: " |
960
|
0
|
|
|
|
|
|
<< 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: " |
961
|
0
|
|
|
|
|
|
<< 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl; |
962
|
|
|
|
|
|
|
|
963
|
0
|
0
|
|
|
|
|
if (parser != NONE) { |
964
|
0
|
|
|
|
|
|
auto uas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; }); |
965
|
0
|
0
|
|
|
|
|
auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
0
|
|
|
|
|
|
966
|
0
|
|
|
|
|
|
os << "Parsing from gold tokenization with computed tags - forms: " << uas.total_gold |
967
|
0
|
|
|
|
|
|
<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl; |
968
|
|
|
|
|
|
|
} |
969
|
|
|
|
|
|
|
} |
970
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
// Evaluate parser from gold tokenization |
972
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
|
0
|
|
|
|
|
|
973
|
|
|
|
|
|
|
word_alignment goldtok_goldtags_alignment; |
974
|
0
|
0
|
|
|
|
|
if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment)) |
|
|
0
|
|
|
|
|
|
975
|
0
|
0
|
|
|
|
|
return error.assign("Internal UDPipe error (the words of the goldtok data do not match)!"), false; |
976
|
|
|
|
|
|
|
|
977
|
0
|
|
|
|
|
|
auto uas = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; }); |
978
|
0
|
0
|
|
|
|
|
auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
0
|
|
|
|
|
|
979
|
0
|
|
|
|
|
|
os << "Parsing from gold tokenization with gold tags - forms: " << uas.total_gold |
980
|
0
|
|
|
|
|
|
<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl; |
981
|
|
|
|
|
|
|
} |
982
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
return true; |
984
|
|
|
|
|
|
|
} |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
template |
987
|
0
|
|
|
|
|
|
evaluator::f1_info evaluator::evaluate_f1(const vector>& system, const vector>& gold) { |
988
|
|
|
|
|
|
|
size_t both = 0; |
989
|
0
|
0
|
|
|
|
|
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
990
|
0
|
0
|
|
|
|
|
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
991
|
0
|
|
|
|
|
|
si++; |
992
|
0
|
0
|
|
|
|
|
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
993
|
0
|
|
|
|
|
|
gi++; |
994
|
|
|
|
|
|
|
else |
995
|
0
|
|
|
|
|
|
both += system[si++].second == gold[gi++].second; |
996
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
return {system.size(), gold.size(), system.size() ? both / double(system.size()) : 0., |
998
|
0
|
0
|
|
|
|
|
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
999
|
|
|
|
|
|
|
} |
1000
|
|
|
|
|
|
|
|
1001
|
0
|
|
|
|
|
|
// Stores a copy of word w together with its character span [start, end) and
// multiword flag, normalized for evaluation: ids become absolute, the form is
// lowercased and the deprel is cut at the first colon.
evaluator::evaluation_data::word_data::word_data(size_t start, size_t end, int id, bool is_multiword, const word& w)
  : start(start), end(end), is_multiword(is_multiword), w(w)
{
  // Note: inside the body, plain `w` is the constructor argument and
  // `this->w` is the stored copy.

  // Use absolute ids for words and heads: the head keeps its offset from the
  // word, while a zero head stays zero.
  this->w.id = id;
  if (w.head)
    this->w.head = id + (w.head - w.id);
  else
    this->w.head = 0;

  // Forms in MWTs are compared case-insensitively in LCS, therefore
  // we lowercase them here.
  unilib::utf8::map(unilib::unicode::lowercase, w.form, this->w.form);

  // During evaluation, only universal part of DEPREL (up to a colon) is used.
  const auto colon_pos = w.deprel.find(':');
  if (colon_pos != string::npos) {
    this->w.deprel.erase(colon_pos);
  }
}
1017
|
|
|
|
|
|
|
|
1018
|
0
|
|
|
|
|
|
void evaluator::evaluation_data::add_sentence(const sentence& s) { |
1019
|
0
|
|
|
|
|
|
sentences.emplace_back(chars.size(), chars.size()); |
1020
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
1021
|
0
|
|
|
|
|
|
tokens.emplace_back(chars.size(), chars.size()); |
1022
|
0
|
0
|
|
|
|
|
const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form; |
|
|
0
|
|
|
|
|
|
1023
|
0
|
0
|
|
|
|
|
for (auto&& chr : unilib::utf8::decoder(form)) |
1024
|
0
|
0
|
|
|
|
|
if (chr != ' ') |
1025
|
0
|
|
|
|
|
|
chars.push_back(chr); |
1026
|
0
|
|
|
|
|
|
tokens.back().second = chars.size(); |
1027
|
|
|
|
|
|
|
|
1028
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1029
|
0
|
|
|
|
|
|
multiwords.emplace_back(tokens.back().first, form); |
1030
|
0
|
0
|
|
|
|
|
for (size_t k = i; int(k) <= s.multiword_tokens[j].id_last; k++) { |
1031
|
0
|
|
|
|
|
|
words.emplace_back(tokens.back().first, tokens.back().second, (int)words.size() + 1, true, s.words[k]); |
1032
|
0
|
|
|
|
|
|
multiwords.back().second.append(" ").append(words.back().w.form); |
1033
|
|
|
|
|
|
|
} |
1034
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
1035
|
|
|
|
|
|
|
} else { |
1036
|
0
|
|
|
|
|
|
words.emplace_back(tokens.back().first, tokens.back().second, (int)words.size() + 1, false, s.words[i]); |
1037
|
|
|
|
|
|
|
} |
1038
|
|
|
|
|
|
|
} |
1039
|
0
|
|
|
|
|
|
sentences.back().second = chars.size(); |
1040
|
0
|
|
|
|
|
|
} |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
template |
1043
|
0
|
|
|
|
|
|
evaluator::f1_info evaluator::word_alignment::evaluate_f1(Equals equals) { |
1044
|
|
|
|
|
|
|
size_t both = 0; |
1045
|
0
|
0
|
|
|
|
|
for (auto&& match : matched) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1046
|
0
|
0
|
|
|
|
|
if (equals(match.system, match.gold)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1047
|
0
|
|
|
|
|
|
both++; |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
return {total_system, total_gold, total_system ? both / double(total_system) : 0., |
1050
|
0
|
0
|
|
|
|
|
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
} |
1052
|
|
|
|
|
|
|
|
1053
|
0
|
|
|
|
|
|
bool evaluator::word_alignment::perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) { |
1054
|
0
|
|
|
|
|
|
alignment.total_system = system.words.size(); |
1055
|
0
|
|
|
|
|
|
alignment.total_gold = gold.words.size(); |
1056
|
0
|
0
|
|
|
|
|
if (alignment.total_system != alignment.total_gold) return false; |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
alignment.matched.clear(); |
1059
|
0
|
|
|
|
|
|
alignment.matched.reserve(alignment.total_system); |
1060
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < system.words.size(); i++) { |
1061
|
0
|
0
|
|
|
|
|
if (system.words[i].w.form != gold.words[i].w.form) |
1062
|
|
|
|
|
|
|
return false; |
1063
|
0
|
|
|
|
|
|
alignment.matched.emplace_back(system.words[i].w, gold.words[i].w); |
1064
|
|
|
|
|
|
|
} |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
return true; |
1067
|
|
|
|
|
|
|
} |
1068
|
|
|
|
|
|
|
|
1069
|
0
|
|
|
|
|
|
void evaluator::word_alignment::best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) { |
1070
|
0
|
|
|
|
|
|
alignment.total_system = system.words.size(); |
1071
|
0
|
|
|
|
|
|
alignment.total_gold = gold.words.size(); |
1072
|
|
|
|
|
|
|
alignment.matched.clear(); |
1073
|
|
|
|
|
|
|
|
1074
|
0
|
0
|
|
|
|
|
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1075
|
0
|
0
|
|
|
|
|
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1076
|
0
|
0
|
|
|
|
|
(gold.words[gi].start > system.words[si].start || !gold.words[gi].is_multiword)) { |
1077
|
|
|
|
|
|
|
// No multiword, align using start+end indices |
1078
|
0
|
0
|
|
|
|
|
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1079
|
0
|
|
|
|
|
|
alignment.matched.emplace_back(system.words[si++].w, gold.words[gi++].w); |
1080
|
0
|
0
|
|
|
|
|
else if (system.words[si].start <= gold.words[gi].start) |
1081
|
0
|
|
|
|
|
|
si++; |
1082
|
|
|
|
|
|
|
else |
1083
|
0
|
|
|
|
|
|
gi++; |
1084
|
|
|
|
|
|
|
} else { |
1085
|
|
|
|
|
|
|
// We have a multiword |
1086
|
0
|
0
|
|
|
|
|
size_t ss = si, gs = gi, multiword_range_end = system.words[si].is_multiword ? system.words[si].end : gold.words[gi].end; |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
// Find all words in the multiword range |
1089
|
0
|
0
|
|
|
|
|
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1090
|
0
|
0
|
|
|
|
|
system.words[si].end <= multiword_range_end)) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1091
|
0
|
0
|
|
|
|
|
(gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end : |
|
|
0
|
|
|
|
|
|
1092
|
0
|
|
|
|
|
|
gold.words[gi].end <= multiword_range_end))) { |
1093
|
|
|
|
|
|
|
// Extend the multiword range |
1094
|
0
|
0
|
|
|
|
|
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1095
|
0
|
0
|
|
|
|
|
if (system.words[si].is_multiword) multiword_range_end = max(multiword_range_end, system.words[si].end); |
1096
|
0
|
|
|
|
|
|
si++; |
1097
|
|
|
|
|
|
|
} else { |
1098
|
0
|
0
|
|
|
|
|
if (gold.words[gi].is_multiword) multiword_range_end = max(multiword_range_end, gold.words[gi].end); |
1099
|
0
|
|
|
|
|
|
gi++; |
1100
|
|
|
|
|
|
|
} |
1101
|
|
|
|
|
|
|
} |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
// LCS on the chosen words |
1104
|
0
|
|
|
|
|
|
vector> lcs(si - ss); |
1105
|
0
|
0
|
|
|
|
|
for (unsigned s = si - ss; s--; ) { |
1106
|
0
|
0
|
|
|
|
|
lcs[s].resize(gi - gs); |
1107
|
0
|
0
|
|
|
|
|
for (unsigned g = gi - gs; g--; ) { |
1108
|
0
|
0
|
|
|
|
|
lcs[s][g] = max(lcs[s][g], s+1 < lcs.size() ? lcs[s+1][g] : 0); |
1109
|
0
|
0
|
|
|
|
|
lcs[s][g] = max(lcs[s][g], g+1 < lcs[s].size() ? lcs[s][g+1] : 0); |
1110
|
0
|
0
|
|
|
|
|
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
1111
|
0
|
0
|
|
|
|
|
lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0)); |
|
|
0
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
} |
1113
|
|
|
|
|
|
|
} |
1114
|
|
|
|
|
|
|
|
1115
|
0
|
0
|
|
|
|
|
for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) { |
|
|
0
|
|
|
|
|
|
1116
|
0
|
0
|
|
|
|
|
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
1117
|
0
|
0
|
|
|
|
|
alignment.matched.emplace_back(system.words[ss + s++].w, gold.words[gs + g++].w); |
1118
|
0
|
0
|
|
|
|
|
else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0)) |
|
|
0
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
s++; |
1120
|
|
|
|
|
|
|
else /* if (lcs[s][g] == (g+1 < lcs[s].size() ? lcs[s][g+1] : 0)) */ |
1121
|
0
|
|
|
|
|
|
g++; |
1122
|
|
|
|
|
|
|
} |
1123
|
|
|
|
|
|
|
} |
1124
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
// Reindex HEAD pointers in system to use gold indices |
1126
|
0
|
|
|
|
|
|
vector gold_aligned(system.words.size(), -1); |
1127
|
0
|
0
|
|
|
|
|
for (auto&& match : alignment.matched) |
1128
|
0
|
|
|
|
|
|
gold_aligned[match.system.id - 1] = match.gold.id; |
1129
|
0
|
0
|
|
|
|
|
for (auto&& match : alignment.matched) |
1130
|
0
|
0
|
|
|
|
|
if (match.system.head > 0) |
1131
|
0
|
|
|
|
|
|
match.system.head = gold_aligned[match.system.head - 1]; |
1132
|
0
|
|
|
|
|
|
} |
1133
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
///////// |
1135
|
|
|
|
|
|
|
// File: morphodita/derivator/derivator.h |
1136
|
|
|
|
|
|
|
///////// |
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
// This file is part of MorphoDiTa. |
1139
|
|
|
|
|
|
|
// |
1140
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
1141
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1142
|
|
|
|
|
|
|
// |
1143
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1144
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1145
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1146
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
namespace morphodita { |
1148
|
|
|
|
|
|
|
|
1149
|
0
|
|
|
|
|
|
struct derivated_lemma { |
1150
|
|
|
|
|
|
|
string lemma; |
1151
|
|
|
|
|
|
|
}; |
1152
|
|
|
|
|
|
|
|
1153
|
0
|
|
|
|
|
|
class derivator { |
1154
|
|
|
|
|
|
|
public: |
1155
|
0
|
|
|
|
|
|
virtual ~derivator() {} |
1156
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
// For a given lemma, return the parent in the derivation graph. |
1158
|
|
|
|
|
|
|
// The lemma is assumed to be lemma id and any lemma comments are ignored. |
1159
|
|
|
|
|
|
|
virtual bool parent(string_piece lemma, derivated_lemma& parent) const = 0; |
1160
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
// For a given lemma, return the children in the derivation graph. |
1162
|
|
|
|
|
|
|
// The lemma is assumed to be lemma id and any lemma comments are ignored. |
1163
|
|
|
|
|
|
|
virtual bool children(string_piece lemma, vector& children) const = 0; |
1164
|
|
|
|
|
|
|
}; |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
} // namespace morphodita |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
///////// |
1169
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer.h |
1170
|
|
|
|
|
|
|
///////// |
1171
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
// This file is part of MorphoDiTa. |
1173
|
|
|
|
|
|
|
// |
1174
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1175
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1176
|
|
|
|
|
|
|
// |
1177
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1178
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1179
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
namespace morphodita { |
1182
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
// Range of a token, measured in Unicode characters, not UTF8 bytes. |
1184
|
|
|
|
|
|
|
struct token_range { |
1185
|
|
|
|
|
|
|
size_t start; |
1186
|
|
|
|
|
|
|
size_t length; |
1187
|
|
|
|
|
|
|
|
1188
|
|
|
|
|
|
|
token_range() {} |
1189
|
7
|
|
|
|
|
|
token_range(size_t start, size_t length) : start(start), length(length) {} |
1190
|
|
|
|
|
|
|
}; |
1191
|
|
|
|
|
|
|
|
1192
|
1
|
|
|
|
|
|
class tokenizer { |
1193
|
|
|
|
|
|
|
public: |
1194
|
1
|
|
|
|
|
|
virtual ~tokenizer() {} |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) = 0; |
1197
|
|
|
|
|
|
|
virtual bool next_sentence(vector* forms, vector* tokens) = 0; |
1198
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
// Static factory methods |
1200
|
|
|
|
|
|
|
static tokenizer* new_vertical_tokenizer(); |
1201
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
static tokenizer* new_czech_tokenizer(); |
1203
|
|
|
|
|
|
|
static tokenizer* new_english_tokenizer(); |
1204
|
|
|
|
|
|
|
static tokenizer* new_generic_tokenizer(); |
1205
|
|
|
|
|
|
|
}; |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
} // namespace morphodita |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
///////// |
1210
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho.h |
1211
|
|
|
|
|
|
|
///////// |
1212
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
// This file is part of MorphoDiTa. |
1214
|
|
|
|
|
|
|
// |
1215
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1216
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1217
|
|
|
|
|
|
|
// |
1218
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1219
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1220
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
namespace morphodita { |
1223
|
|
|
|
|
|
|
|
1224
|
0
|
|
|
|
|
|
struct tagged_form { |
1225
|
|
|
|
|
|
|
string form; |
1226
|
|
|
|
|
|
|
string tag; |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
tagged_form() {} |
1229
|
0
|
|
|
|
|
|
tagged_form(const string& form, const string& tag) : form(form), tag(tag) {} |
1230
|
|
|
|
|
|
|
}; |
1231
|
|
|
|
|
|
|
|
1232
|
46
|
|
|
|
|
|
struct tagged_lemma { |
1233
|
|
|
|
|
|
|
string lemma; |
1234
|
|
|
|
|
|
|
string tag; |
1235
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
tagged_lemma() {} |
1237
|
10
|
|
|
|
|
|
tagged_lemma(const string& lemma, const string& tag) : lemma(lemma), tag(tag) {} |
1238
|
|
|
|
|
|
|
}; |
1239
|
|
|
|
|
|
|
|
1240
|
0
|
|
|
|
|
|
struct tagged_lemma_forms { |
1241
|
|
|
|
|
|
|
string lemma; |
1242
|
|
|
|
|
|
|
vector forms; |
1243
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
tagged_lemma_forms() {} |
1245
|
0
|
|
|
|
|
|
tagged_lemma_forms(const string& lemma) : lemma(lemma) {} |
1246
|
|
|
|
|
|
|
}; |
1247
|
|
|
|
|
|
|
|
1248
|
1
|
|
|
|
|
|
class morpho { |
1249
|
|
|
|
|
|
|
public: |
1250
|
2
|
|
|
|
|
|
virtual ~morpho() {} |
1251
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
static morpho* load(istream& is); |
1253
|
|
|
|
|
|
|
static morpho* load(const char* fname); |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
enum guesser_mode { NO_GUESSER = 0, GUESSER = 1, GUESSER_UNSPECIFIED = -1 }; |
1256
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
// Perform morphologic analysis of a form. The form is given by a pointer and |
1258
|
|
|
|
|
|
|
// length and therefore does not need to be '\0' terminated. The guesser |
1259
|
|
|
|
|
|
|
// parameter specifies whether a guesser can be used if the form is not found |
1260
|
|
|
|
|
|
|
// in the dictionary. Output is assigned to the lemmas vector. |
1261
|
|
|
|
|
|
|
// |
1262
|
|
|
|
|
|
|
// If the form is found in the dictionary, analyses are assigned to lemmas |
1263
|
|
|
|
|
|
|
// and NO_GUESSER returned. If guesser == GUESSER and the form analyses are |
1264
|
|
|
|
|
|
|
// found using a guesser, they are assigned to lemmas and GUESSER is |
1265
|
|
|
|
|
|
|
// returned. Otherwise <0 is returned and lemmas are filled with one |
1266
|
|
|
|
|
|
|
// analysis containing given form as lemma and a tag for unknown word. |
1267
|
|
|
|
|
|
|
virtual int analyze(string_piece form, guesser_mode guesser, vector& lemmas) const = 0; |
1268
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
// Perform morphologic generation of a lemma. The lemma is given by a pointer |
1270
|
|
|
|
|
|
|
// and length and therefore does not need to be '\0' terminated. Optionally |
1271
|
|
|
|
|
|
|
// a tag_wildcard can be specified (or be NULL) and if so, results are |
1272
|
|
|
|
|
|
|
// filtered using this wildcard. The guesser parameter specifies whether |
1273
|
|
|
|
|
|
|
// a guesser can be used if the lemma is not found in the dictionary. Output |
1274
|
|
|
|
|
|
|
// is assigned to the forms vector. |
1275
|
|
|
|
|
|
|
// |
1276
|
|
|
|
|
|
|
// Tag_wildcard can be either NULL or a wildcard applied to the results. |
1277
|
|
|
|
|
|
|
// A ? in the wildcard matches any character, [bytes] matches any of the |
1278
|
|
|
|
|
|
|
// bytes and [^bytes] matches any byte different from the specified ones. |
1279
|
|
|
|
|
|
|
// A - has no special meaning inside the bytes and if ] is first in bytes, it |
1280
|
|
|
|
|
|
|
// does not end the bytes group. |
1281
|
|
|
|
|
|
|
// |
1282
|
|
|
|
|
|
|
// If the given lemma is only a raw lemma, all lemma ids with this raw lemma |
1283
|
|
|
|
|
|
|
// are returned. Otherwise only matching lemma ids are returned, ignoring any |
1284
|
|
|
|
|
|
|
// lemma comments. For every found lemma, matching forms are filtered using |
1285
|
|
|
|
|
|
|
// the tag_wildcard. If at least one lemma is found in the dictionary, |
1286
|
|
|
|
|
|
|
// NO_GUESSER is returned. If guesser == GUESSER and the lemma is found by |
1287
|
|
|
|
|
|
|
// the guesser, GUESSER is returned. Otherwise, forms are cleared and <0 is |
1288
|
|
|
|
|
|
|
// returned. |
1289
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const = 0; |
1290
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
// Rawlemma and lemma id identification |
1292
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const = 0; |
1293
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const = 0; |
1294
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
// Rawform identification |
1296
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const = 0; |
1297
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
// Construct a new tokenizer instance appropriate for this morphology. |
1299
|
|
|
|
|
|
|
// Can return NULL if no such tokenizer exists. |
1300
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const = 0; |
1301
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
// Return a derivator for this morphology, or NULL if it does not exist. |
1303
|
|
|
|
|
|
|
// The returned instance is owned by the morphology and should not be deleted. |
1304
|
|
|
|
|
|
|
virtual const derivator* get_derivator() const; |
1305
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
protected: |
1307
|
|
|
|
|
|
|
unique_ptr derinet; |
1308
|
|
|
|
|
|
|
}; |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
} // namespace morphodita |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
///////// |
1313
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer_factory.h |
1314
|
|
|
|
|
|
|
///////// |
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
// This file is part of MorphoDiTa. |
1317
|
|
|
|
|
|
|
// |
1318
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
1319
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1320
|
|
|
|
|
|
|
// |
1321
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1322
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1323
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1324
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
namespace morphodita { |
1326
|
|
|
|
|
|
|
|
1327
|
1
|
|
|
|
|
|
class tokenizer_factory { |
1328
|
|
|
|
|
|
|
public: |
1329
|
1
|
|
|
|
|
|
virtual ~tokenizer_factory() {} |
1330
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
static tokenizer_factory* load(istream& is); |
1332
|
|
|
|
|
|
|
static tokenizer_factory* load(const char* fname); |
1333
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
// Construct a new tokenizer instance. |
1335
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer(const morpho* m) const = 0; |
1336
|
|
|
|
|
|
|
}; |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
} // namespace morphodita |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
///////// |
1341
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger.h |
1342
|
|
|
|
|
|
|
///////// |
1343
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
// This file is part of MorphoDiTa. |
1345
|
|
|
|
|
|
|
// |
1346
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1347
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1348
|
|
|
|
|
|
|
// |
1349
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1350
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1351
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
namespace morphodita { |
1354
|
|
|
|
|
|
|
|
1355
|
1
|
|
|
|
|
|
class tagger { |
1356
|
|
|
|
|
|
|
public: |
1357
|
1
|
|
|
|
|
|
virtual ~tagger() {} |
1358
|
|
|
|
|
|
|
|
1359
|
|
|
|
|
|
|
static tagger* load(const char* fname); |
1360
|
|
|
|
|
|
|
static tagger* load(istream& is); |
1361
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
// Return morpho associated with the tagger. Do not delete the pointer, it is |
1363
|
|
|
|
|
|
|
// owned by the tagger instance and deleted in the tagger destructor. |
1364
|
|
|
|
|
|
|
virtual const morpho* get_morpho() const = 0; |
1365
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
// Perform morphologic analysis and subsequent disambiguation. |
1367
|
|
|
|
|
|
|
virtual void tag(const vector& forms, vector& tags, morpho::guesser_mode guesser = morpho::GUESSER_UNSPECIFIED) const = 0; |
1368
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
// Perform disambiguation only on given analyses. |
1370
|
|
|
|
|
|
|
virtual void tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const = 0; |
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
// Construct a new tokenizer instance appropriate for this tagger. |
1373
|
|
|
|
|
|
|
// Can return NULL if no such tokenizer exists. |
1374
|
|
|
|
|
|
|
// Is equal to get_morpho()->new_tokenizer. |
1375
|
|
|
|
|
|
|
tokenizer* new_tokenizer() const; |
1376
|
|
|
|
|
|
|
}; |
1377
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
} // namespace morphodita |
1379
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
///////// |
1381
|
|
|
|
|
|
|
// File: parsito/tree/node.h |
1382
|
|
|
|
|
|
|
///////// |
1383
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
// This file is part of Parsito. |
1385
|
|
|
|
|
|
|
// |
1386
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1387
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1388
|
|
|
|
|
|
|
// |
1389
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1390
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1391
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1392
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
namespace parsito { |
1394
|
|
|
|
|
|
|
|
1395
|
23
|
0
|
|
|
|
|
class node { |
1396
|
|
|
|
|
|
|
public: |
1397
|
|
|
|
|
|
|
int id; // 0 is root, >0 is sentence node, <0 is undefined |
1398
|
|
|
|
|
|
|
string form; // form |
1399
|
|
|
|
|
|
|
string lemma; // lemma |
1400
|
|
|
|
|
|
|
string upostag; // universal part-of-speech tag |
1401
|
|
|
|
|
|
|
string xpostag; // language-specific part-of-speech tag |
1402
|
|
|
|
|
|
|
string feats; // list of morphological features |
1403
|
|
|
|
|
|
|
int head; // head, 0 is root, <0 is without parent |
1404
|
|
|
|
|
|
|
string deprel; // dependency relation to the head |
1405
|
|
|
|
|
|
|
string deps; // secondary dependencies |
1406
|
|
|
|
|
|
|
string misc; // miscellaneous information |
1407
|
|
|
|
|
|
|
|
1408
|
|
|
|
|
|
|
vector children; |
1409
|
|
|
|
|
|
|
|
1410
|
9
|
|
|
|
|
|
node(int id = -1, const string& form = string()) : id(id), form(form), head(-1) {} |
1411
|
|
|
|
|
|
|
}; |
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
} // namespace parsito |
1414
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
///////// |
1416
|
|
|
|
|
|
|
// File: parsito/tree/tree.h |
1417
|
|
|
|
|
|
|
///////// |
1418
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
// This file is part of Parsito. |
1420
|
|
|
|
|
|
|
// |
1421
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1422
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1423
|
|
|
|
|
|
|
// |
1424
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1425
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1426
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
namespace parsito { |
1429
|
|
|
|
|
|
|
|
1430
|
1
|
0
|
|
|
|
|
class tree { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
public: |
1432
|
|
|
|
|
|
|
tree(); |
1433
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
vector nodes; |
1435
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
bool empty(); |
1437
|
|
|
|
|
|
|
void clear(); |
1438
|
|
|
|
|
|
|
node& add_node(const string& form); |
1439
|
|
|
|
|
|
|
void set_head(int id, int head, const string& deprel); |
1440
|
|
|
|
|
|
|
void unlink_all_nodes(); |
1441
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
static const string root_form; |
1443
|
|
|
|
|
|
|
}; |
1444
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
} // namespace parsito |
1446
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
///////// |
1448
|
|
|
|
|
|
|
// File: parsito/configuration/configuration.h |
1449
|
|
|
|
|
|
|
///////// |
1450
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
// This file is part of Parsito. |
1452
|
|
|
|
|
|
|
// |
1453
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1454
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1455
|
|
|
|
|
|
|
// |
1456
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1457
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1458
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1459
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
namespace parsito { |
1461
|
|
|
|
|
|
|
|
1462
|
167
|
|
|
|
|
|
class configuration { |
1463
|
|
|
|
|
|
|
public: |
1464
|
11
|
|
|
|
|
|
configuration(bool single_root) : single_root(single_root) {} |
1465
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
void init(tree* t); |
1467
|
|
|
|
|
|
|
bool final(); |
1468
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
tree* t; |
1470
|
|
|
|
|
|
|
vector stack; |
1471
|
|
|
|
|
|
|
vector buffer; |
1472
|
|
|
|
|
|
|
|
1473
|
|
|
|
|
|
|
bool single_root; |
1474
|
|
|
|
|
|
|
}; |
1475
|
|
|
|
|
|
|
|
1476
|
|
|
|
|
|
|
} // namespace parsito |
1477
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
///////// |
1479
|
|
|
|
|
|
|
// File: utils/binary_decoder.h |
1480
|
|
|
|
|
|
|
///////// |
1481
|
|
|
|
|
|
|
|
1482
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils. |
1483
|
|
|
|
|
|
|
// |
1484
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1485
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1486
|
|
|
|
|
|
|
// |
1487
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1488
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1489
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1490
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
namespace utils { |
1492
|
|
|
|
|
|
|
|
1493
|
|
|
|
|
|
|
// |
1494
|
|
|
|
|
|
|
// Declarations |
1495
|
|
|
|
|
|
|
// |
1496
|
|
|
|
|
|
|
|
1497
|
0
|
|
|
|
|
|
class binary_decoder_error : public runtime_error { |
1498
|
|
|
|
|
|
|
public: |
1499
|
0
|
0
|
|
|
|
|
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1500
|
|
|
|
|
|
|
}; |
1501
|
|
|
|
|
|
|
|
1502
|
5
|
0
|
|
|
|
|
class binary_decoder { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1503
|
|
|
|
|
|
|
public: |
1504
|
|
|
|
|
|
|
inline unsigned char* fill(unsigned len); |
1505
|
|
|
|
|
|
|
|
1506
|
|
|
|
|
|
|
inline unsigned next_1B(); |
1507
|
|
|
|
|
|
|
inline unsigned next_2B(); |
1508
|
|
|
|
|
|
|
inline unsigned next_4B(); |
1509
|
|
|
|
|
|
|
inline void next_str(string& str); |
1510
|
|
|
|
|
|
|
template inline const T* next(unsigned elements); |
1511
|
|
|
|
|
|
|
|
1512
|
|
|
|
|
|
|
inline bool is_end(); |
1513
|
|
|
|
|
|
|
inline unsigned tell(); |
1514
|
|
|
|
|
|
|
inline void seek(unsigned pos); |
1515
|
|
|
|
|
|
|
|
1516
|
|
|
|
|
|
|
private: |
1517
|
|
|
|
|
|
|
vector buffer; |
1518
|
|
|
|
|
|
|
const unsigned char* data; |
1519
|
|
|
|
|
|
|
const unsigned char* data_end; |
1520
|
|
|
|
|
|
|
}; |
1521
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
// |
1523
|
|
|
|
|
|
|
// Definitions |
1524
|
|
|
|
|
|
|
// |
1525
|
|
|
|
|
|
|
|
1526
|
|
|
|
|
|
|
unsigned char* binary_decoder::fill(unsigned len) { |
1527
|
6
|
50
|
|
|
|
|
buffer.resize(len); |
1528
|
6
|
|
|
|
|
|
data = buffer.data(); |
1529
|
6
|
|
|
|
|
|
data_end = buffer.data() + len; |
1530
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
return buffer.data(); |
1532
|
|
|
|
|
|
|
} |
1533
|
|
|
|
|
|
|
|
1534
|
2616
|
|
|
|
|
|
unsigned binary_decoder::next_1B() { |
1535
|
1308
|
50
|
|
|
|
|
if (data + 1 > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
1536
|
1308
|
|
|
|
|
|
return *data++; |
1537
|
|
|
|
|
|
|
} |
1538
|
|
|
|
|
|
|
|
1539
|
26
|
|
|
|
|
|
unsigned binary_decoder::next_2B() { |
1540
|
26
|
50
|
|
|
|
|
if (data + sizeof(uint16_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
1541
|
|
|
|
|
|
|
uint16_t result; |
1542
|
|
|
|
|
|
|
memcpy(&result, data, sizeof(uint16_t)); |
1543
|
26
|
|
|
|
|
|
data += sizeof(uint16_t); |
1544
|
26
|
|
|
|
|
|
return result; |
1545
|
|
|
|
|
|
|
} |
1546
|
|
|
|
|
|
|
|
1547
|
1573
|
|
|
|
|
|
unsigned binary_decoder::next_4B() { |
1548
|
1573
|
50
|
|
|
|
|
if (data + sizeof(uint32_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
1549
|
|
|
|
|
|
|
uint32_t result; |
1550
|
|
|
|
|
|
|
memcpy(&result, data, sizeof(uint32_t)); |
1551
|
1573
|
|
|
|
|
|
data += sizeof(uint32_t); |
1552
|
1573
|
|
|
|
|
|
return result; |
1553
|
|
|
|
|
|
|
} |
1554
|
|
|
|
|
|
|
|
1555
|
36
|
|
|
|
|
|
void binary_decoder::next_str(string& str) { |
1556
|
36
|
|
|
|
|
|
unsigned len = next_1B(); |
1557
|
36
|
100
|
|
|
|
|
if (len == 255) len = next_4B(); |
1558
|
36
|
|
|
|
|
|
str.assign(next(len), len); |
1559
|
36
|
|
|
|
|
|
} |
1560
|
|
|
|
|
|
|
|
1561
|
2544
|
|
|
|
|
|
template const T* binary_decoder::next(unsigned elements) { |
1562
|
1272
|
50
|
|
|
|
|
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1563
|
|
|
|
|
|
|
const T* result = (const T*) data; |
1564
|
1272
|
|
|
|
|
|
data += sizeof(T) * elements; |
1565
|
1272
|
|
|
|
|
|
return result; |
1566
|
|
|
|
|
|
|
} |
1567
|
|
|
|
|
|
|
|
1568
|
|
|
|
|
|
|
bool binary_decoder::is_end() { |
1569
|
4
|
|
|
|
|
|
return data >= data_end; |
1570
|
|
|
|
|
|
|
} |
1571
|
|
|
|
|
|
|
|
1572
|
|
|
|
|
|
|
unsigned binary_decoder::tell() { |
1573
|
1
|
|
|
|
|
|
return data - buffer.data(); |
1574
|
|
|
|
|
|
|
} |
1575
|
|
|
|
|
|
|
|
1576
|
1
|
|
|
|
|
|
void binary_decoder::seek(unsigned pos) { |
1577
|
1
|
50
|
|
|
|
|
if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder"); |
1578
|
1
|
|
|
|
|
|
data = buffer.data() + pos; |
1579
|
1
|
|
|
|
|
|
} |
1580
|
|
|
|
|
|
|
|
1581
|
|
|
|
|
|
|
} // namespace utils |
1582
|
|
|
|
|
|
|
|
1583
|
|
|
|
|
|
|
///////// |
1584
|
|
|
|
|
|
|
// File: parsito/parser/parser.h |
1585
|
|
|
|
|
|
|
///////// |
1586
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
// This file is part of Parsito. |
1588
|
|
|
|
|
|
|
// |
1589
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1590
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1591
|
|
|
|
|
|
|
// |
1592
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1593
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1594
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1595
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
namespace parsito { |
1597
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
// Parser |
1599
|
1
|
|
|
|
|
|
class parser { |
1600
|
|
|
|
|
|
|
public: |
1601
|
1
|
|
|
|
|
|
virtual ~parser() {}; |
1602
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const = 0; |
1604
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
enum { NO_CACHE = 0, FULL_CACHE = 2147483647}; |
1606
|
|
|
|
|
|
|
static parser* load(const char* file, unsigned cache = 1000); |
1607
|
|
|
|
|
|
|
static parser* load(istream& in, unsigned cache = 1000); |
1608
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
protected: |
1610
|
|
|
|
|
|
|
virtual void load(binary_decoder& data, unsigned cache) = 0; |
1611
|
|
|
|
|
|
|
static parser* create(const string& name); |
1612
|
|
|
|
|
|
|
}; |
1613
|
|
|
|
|
|
|
|
1614
|
|
|
|
|
|
|
} // namespace parsito |
1615
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
///////// |
1617
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter.h |
1618
|
|
|
|
|
|
|
///////// |
1619
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
// This file is part of UDPipe . |
1621
|
|
|
|
|
|
|
// |
1622
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
1623
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1624
|
|
|
|
|
|
|
// |
1625
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1626
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1627
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1628
|
|
|
|
|
|
|
|
1629
|
1
|
|
|
|
|
|
// Multiword splitter interface.
//
// The constructor is private, so instances come from the static load() (or
// from the befriended multiword_splitter_trainer).
//
// NOTE(review): the template arguments of `vector` and `unordered_map` below
// are missing in this copy of the source (they appear to have been stripped
// during extraction); restore them from the original header before compiling.
class multiword_splitter {
 public:
  // Append `token` (with its `misc` field) to the sentence `s`.
  void append_token(string_piece token, string_piece misc, sentence& s) const;

  // Load a splitter from the given stream.
  static multiword_splitter* load(istream& is);

 private:
  multiword_splitter(unsigned version) : version(version) {}
  unsigned version;
  enum { VERSION_LATEST = 2 };
  friend class multiword_splitter_trainer;

  struct suffix_info {
    vector words;
  };
  // Rule tables; presumably keyed by full form and by suffix -- TODO confirm.
  unordered_map full_rules, suffix_rules;
};
1646
|
|
|
|
|
|
|
|
1647
|
|
|
|
|
|
|
///////// |
1648
|
|
|
|
|
|
|
// File: utils/parse_int.h |
1649
|
|
|
|
|
|
|
///////// |
1650
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
1652
|
|
|
|
|
|
|
// |
1653
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1654
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1655
|
|
|
|
|
|
|
// |
1656
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1657
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1658
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1659
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
namespace utils { |
1661
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
// |
1663
|
|
|
|
|
|
|
// Declarations |
1664
|
|
|
|
|
|
|
// |
1665
|
|
|
|
|
|
|
|
1666
|
|
|
|
|
|
|
// Try to parse an int from given string. If the int cannot be parsed or does |
1667
|
|
|
|
|
|
|
// not fit into int, false is returned and the error string is filled using the |
1668
|
|
|
|
|
|
|
// value_name argument. |
1669
|
|
|
|
|
|
|
inline bool parse_int(string_piece str, const char* value_name, int& value, string& error); |
1670
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
// Try to parse an int from given string. If the int cannot be parsed or does |
1672
|
|
|
|
|
|
|
// not fit into int, an error is displayed and program exits. |
1673
|
|
|
|
|
|
|
inline int parse_int(string_piece str, const char* value_name); |
1674
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
// |
1676
|
|
|
|
|
|
|
// Definitions |
1677
|
|
|
|
|
|
|
// |
1678
|
|
|
|
|
|
|
|
1679
|
68
|
|
|
|
|
|
bool parse_int(string_piece str, const char* value_name, int& value, string& error) { |
1680
|
|
|
|
|
|
|
string_piece original = str; |
1681
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
// Skip spaces |
1683
|
34
|
50
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1684
|
0
|
|
|
|
|
|
str.str++, str.len--; |
1685
|
|
|
|
|
|
|
|
1686
|
|
|
|
|
|
|
// Allow minus |
1687
|
|
|
|
|
|
|
bool positive = true; |
1688
|
34
|
50
|
|
|
|
|
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
100
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
positive = str.str[0] == '+'; |
1690
|
8
|
|
|
|
|
|
str.str++, str.len--; |
1691
|
|
|
|
|
|
|
} |
1692
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
// Parse value, checking for overflow/underflow |
1694
|
34
|
50
|
|
|
|
|
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false; |
1695
|
|
|
|
|
|
|
if (!(str.str[0] >= '0' || str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false; |
1696
|
|
|
|
|
|
|
|
1697
|
34
|
|
|
|
|
|
value = 0; |
1698
|
68
|
100
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
1699
|
34
|
100
|
|
|
|
|
if (positive) { |
1700
|
26
|
50
|
|
|
|
|
if (value > (numeric_limits::max() - (str.str[0] - '0')) / 10) |
1701
|
0
|
|
|
|
|
|
return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': overflow occured."), false; |
1702
|
26
|
|
|
|
|
|
value = 10 * value + (str.str[0] - '0'); |
1703
|
|
|
|
|
|
|
} else { |
1704
|
8
|
50
|
|
|
|
|
if (value < (numeric_limits::min() + (str.str[0] - '0')) / 10) |
1705
|
0
|
|
|
|
|
|
return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': underflow occured."), false; |
1706
|
8
|
|
|
|
|
|
value = 10 * value - (str.str[0] - '0'); |
1707
|
|
|
|
|
|
|
} |
1708
|
34
|
|
|
|
|
|
str.str++, str.len--; |
1709
|
|
|
|
|
|
|
} |
1710
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
// Skip spaces |
1712
|
34
|
50
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1713
|
0
|
|
|
|
|
|
str.str++, str.len--; |
1714
|
|
|
|
|
|
|
|
1715
|
|
|
|
|
|
|
// Check for remaining characters |
1716
|
34
|
50
|
|
|
|
|
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false; |
1717
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
return true; |
1719
|
|
|
|
|
|
|
} |
1720
|
|
|
|
|
|
|
|
1721
|
0
|
|
|
|
|
|
// Parse an int from `str` and return it. When the text cannot be parsed or
// does not fit into int, the error message is handed to runtime_failure.
int parse_int(string_piece str, const char* value_name) {
  string failure_text;
  int parsed;

  const bool ok = parse_int(str, value_name, parsed, failure_text);
  if (!ok)
    runtime_failure(failure_text);

  return parsed;
}
1730
|
|
|
|
|
|
|
|
1731
|
|
|
|
|
|
|
} // namespace utils |
1732
|
|
|
|
|
|
|
|
1733
|
|
|
|
|
|
|
///////// |
1734
|
|
|
|
|
|
|
// File: utils/path_from_utf8.h |
1735
|
|
|
|
|
|
|
///////// |
1736
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
1738
|
|
|
|
|
|
|
// |
1739
|
|
|
|
|
|
|
// Copyright 2022 Institute of Formal and Applied Linguistics, Faculty of |
1740
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1741
|
|
|
|
|
|
|
// |
1742
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1743
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1744
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1745
|
|
|
|
|
|
|
|
1746
|
|
|
|
|
|
|
namespace utils { |
1747
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
// |
1749
|
|
|
|
|
|
|
// Declarations |
1750
|
|
|
|
|
|
|
// |
1751
|
|
|
|
|
|
|
|
1752
|
|
|
|
|
|
|
#ifdef _WIN32 |
1753
|
|
|
|
|
|
|
inline wstring path_from_utf8(const char* str); |
1754
|
|
|
|
|
|
|
inline wstring path_from_utf8(const string& str); |
1755
|
|
|
|
|
|
|
#else |
1756
|
|
|
|
|
|
|
inline string path_from_utf8(const char* str); |
1757
|
|
|
|
|
|
|
inline const string& path_from_utf8(const string& str); |
1758
|
|
|
|
|
|
|
#endif |
1759
|
|
|
|
|
|
|
|
1760
|
|
|
|
|
|
|
// |
1761
|
|
|
|
|
|
|
// Definitions |
1762
|
|
|
|
|
|
|
// |
1763
|
|
|
|
|
|
|
|
1764
|
|
|
|
|
|
|
#ifdef _WIN32 |
1765
|
|
|
|
|
|
|
|
1766
|
|
|
|
|
|
|
// Convert a NUL-terminated UTF-8 path to a wide string of UTF-16 code units.
//
// Every malformed sequence (stray continuation byte, lead byte 0xF8 or above,
// missing continuation byte, value above 0x10FFFF) produces a single '?'.
// A byte that fails the continuation check is not consumed, so decoding
// resumes with that byte.
inline wstring path_from_utf8(const char* str) {
  // We could implement this using codecvt_utf8_utf16, but it is not available
  // in GCC 4.9, which we still use. We could also use MultiByteToWideChar,
  // but using it would require changing our build infrastructure -- hence
  // we implement the conversion manually.

  // True when the byte is not of the form 10xxxxxx; this also covers the
  // terminating NUL, which stops a truncated sequence at the end of input.
  auto not_continuation = [](char c) {
    const unsigned char b = (unsigned char)c;
    return b < 0x80 || b >= 0xC0;
  };
  // The six payload bits of a continuation byte.
  auto low6 = [](char c) { return ((unsigned char)c) & 0x3F; };

  wstring wstr;
  while (*str) {
    // The lead byte is always consumed, whatever it turns out to be.
    const unsigned char lead = (unsigned char)*str++;

    char32_t chr;
    if (lead < 0x80) {
      chr = lead;
    } else if (lead < 0xC0) {
      chr = '?';
    } else if (lead < 0xE0) {
      chr = (lead & 0x1F) << 6;
      if (not_continuation(*str)) chr = '?';
      else chr += low6(*str++);
    } else if (lead < 0xF0) {
      chr = (lead & 0x0F) << 12;
      if (not_continuation(*str)) chr = '?';
      else {
        chr += low6(*str++) << 6;
        if (not_continuation(*str)) chr = '?';
        else chr += low6(*str++);
      }
    } else if (lead < 0xF8) {
      chr = (lead & 0x07) << 18;
      if (not_continuation(*str)) chr = '?';
      else {
        chr += low6(*str++) << 12;
        if (not_continuation(*str)) chr = '?';
        else {
          chr += low6(*str++) << 6;
          if (not_continuation(*str)) chr = '?';
          else chr += low6(*str++);
        }
      }
    } else {
      chr = '?';
    }

    // Emit one code unit, or a surrogate pair above the BMP.
    if (chr > 0x10FFFF) {
      wstr.push_back('?');
    } else if (chr > 0xFFFF) {
      const char32_t offset = chr - 0x10000;
      wstr.push_back(0xD800 + (offset >> 10));
      wstr.push_back(0xDC00 + (offset & 0x3FF));
    } else {
      wstr.push_back(chr);
    }
  }
  return wstr;
}

// Overload for std::string; forwards to the C-string version.
inline wstring path_from_utf8(const string& str) {
  const char* utf8 = str.c_str();
  return path_from_utf8(utf8);
}
1816
|
|
|
|
|
|
|
|
1817
|
|
|
|
|
|
|
#else |
1818
|
|
|
|
|
|
|
|
1819
|
|
|
|
|
|
|
// In this build configuration paths are used as given, so the C-string
// overload only wraps its argument in a string.
inline string path_from_utf8(const char* str) {
  return string(str);
}

// The string overload returns its argument unchanged, by reference.
inline const string& path_from_utf8(const string& str) {
  return str;
}
1826
|
|
|
|
|
|
|
|
1827
|
|
|
|
|
|
|
#endif |
1828
|
|
|
|
|
|
|
|
1829
|
|
|
|
|
|
|
} // namespace utils |
1830
|
|
|
|
|
|
|
|
1831
|
|
|
|
|
|
|
///////// |
1832
|
|
|
|
|
|
|
// File: utils/named_values.h |
1833
|
|
|
|
|
|
|
///////// |
1834
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
1836
|
|
|
|
|
|
|
// |
1837
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1838
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1839
|
|
|
|
|
|
|
// |
1840
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1841
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1842
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1843
|
|
|
|
|
|
|
|
1844
|
|
|
|
|
|
|
namespace utils { |
1845
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
// |
1847
|
|
|
|
|
|
|
// Declarations |
1848
|
|
|
|
|
|
|
// |
1849
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
// Parser of "name=value" lists.
class named_values {
 public:
  // Parsed result: value text indexed by name. Both are strings, as
  // established by parse(), which indexes the map with a string name and
  // binds the entry to a string reference.
  typedef unordered_map<string, string> map;

  // Parse `values` into `parsed_values`. Returns false and fills `error` on
  // failure.
  inline static bool parse(const string& values, map& parsed_values, string& error);
};
1856
|
|
|
|
|
|
|
|
1857
|
|
|
|
|
|
|
// |
1858
|
|
|
|
|
|
|
// Definitions |
1859
|
|
|
|
|
|
|
// |
1860
|
|
|
|
|
|
|
|
1861
|
3
|
|
|
|
|
|
bool named_values::parse(const string& values, map& parsed_values, string& error) { |
1862
|
|
|
|
|
|
|
error.clear(); |
1863
|
|
|
|
|
|
|
parsed_values.clear(); |
1864
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
string name, file; |
1866
|
3
|
50
|
|
|
|
|
for (size_t start = 0; start < values.size(); ) { |
1867
|
0
|
0
|
|
|
|
|
while (start < values.size() && values[start] == ';') start++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1868
|
0
|
0
|
|
|
|
|
if (start >= values.size()) break; |
1869
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
size_t name_end = values.find_first_of("=;", start); |
1871
|
0
|
0
|
|
|
|
|
name.assign(values, start, name_end - start); |
1872
|
|
|
|
|
|
|
string& value = parsed_values[name]; |
1873
|
|
|
|
|
|
|
|
1874
|
0
|
0
|
|
|
|
|
if (name_end == string::npos) { |
1875
|
|
|
|
|
|
|
start = name_end; |
1876
|
0
|
0
|
|
|
|
|
} else if (values[name_end] == ';') { |
1877
|
0
|
|
|
|
|
|
start = name_end + 1; |
1878
|
|
|
|
|
|
|
} else /* if (values[name_end] == '=') */ { |
1879
|
|
|
|
|
|
|
size_t equal_sign = name_end; |
1880
|
|
|
|
|
|
|
|
1881
|
0
|
0
|
|
|
|
|
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1882
|
|
|
|
|
|
|
// Value of type file: |
1883
|
|
|
|
|
|
|
size_t file_name = equal_sign + 1 + 5; |
1884
|
0
|
|
|
|
|
|
size_t semicolon = min(values.find(';', file_name), values.size()); |
1885
|
|
|
|
|
|
|
|
1886
|
0
|
0
|
|
|
|
|
file.assign(values, file_name, semicolon - file_name); |
1887
|
0
|
0
|
|
|
|
|
ifstream is(path_from_utf8(file).c_str()); |
1888
|
0
|
0
|
|
|
|
|
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
|
1890
|
|
|
|
|
|
|
char buffer[1024]; |
1891
|
0
|
0
|
|
|
|
|
for (value.clear(); is.read(buffer, sizeof(buffer)); ) |
|
|
0
|
|
|
|
|
|
1892
|
0
|
0
|
|
|
|
|
value.append(buffer, sizeof(buffer)); |
1893
|
0
|
0
|
|
|
|
|
value.append(buffer, is.gcount()); |
1894
|
|
|
|
|
|
|
|
1895
|
0
|
|
|
|
|
|
start = semicolon + 1; |
1896
|
0
|
0
|
|
|
|
|
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1897
|
|
|
|
|
|
|
// Value of type data: |
1898
|
|
|
|
|
|
|
size_t data_size_start = equal_sign + 1 + 5; |
1899
|
0
|
|
|
|
|
|
size_t data_size_end = values.find(':', data_size_start); |
1900
|
0
|
0
|
|
|
|
|
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1901
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
int data_size; |
1903
|
0
|
0
|
|
|
|
|
if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false; |
|
|
0
|
|
|
|
|
|
1904
|
|
|
|
|
|
|
|
1905
|
0
|
|
|
|
|
|
size_t data_start = data_size_end + 1, data_end = data_start + data_size; |
1906
|
0
|
0
|
|
|
|
|
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1907
|
0
|
0
|
|
|
|
|
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
|
1909
|
0
|
0
|
|
|
|
|
value.assign(values, data_start, data_end - data_start); |
1910
|
0
|
|
|
|
|
|
start = data_end + 1; |
1911
|
|
|
|
|
|
|
} else { |
1912
|
|
|
|
|
|
|
// Value of string type |
1913
|
0
|
|
|
|
|
|
size_t semicolon = min(values.find(';', equal_sign), values.size()); |
1914
|
0
|
0
|
|
|
|
|
value.assign(values, equal_sign + 1, semicolon - equal_sign - 1); |
1915
|
0
|
|
|
|
|
|
start = semicolon + 1; |
1916
|
|
|
|
|
|
|
} |
1917
|
|
|
|
|
|
|
} |
1918
|
|
|
|
|
|
|
} |
1919
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
return true; |
1921
|
|
|
|
|
|
|
} |
1922
|
|
|
|
|
|
|
|
1923
|
|
|
|
|
|
|
} // namespace utils |
1924
|
|
|
|
|
|
|
|
1925
|
|
|
|
|
|
|
///////// |
1926
|
|
|
|
|
|
|
// File: utils/threadsafe_stack.h |
1927
|
|
|
|
|
|
|
///////// |
1928
|
|
|
|
|
|
|
|
1929
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
1930
|
|
|
|
|
|
|
// |
1931
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1932
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1933
|
|
|
|
|
|
|
// |
1934
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1935
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1936
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1937
|
|
|
|
|
|
|
|
1938
|
|
|
|
|
|
|
namespace utils { |
1939
|
|
|
|
|
|
|
|
1940
|
|
|
|
|
|
|
// |
1941
|
|
|
|
|
|
|
// Declarations |
1942
|
|
|
|
|
|
|
// |
1943
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
// Stack of owned T objects guarded by a spin lock.
//
// push() takes ownership of the pointer it is given; pop() hands ownership of
// the most recently pushed object back to the caller, or returns nullptr when
// the stack is empty. Objects still on the stack are deleted together with it.
template <class T>
class threadsafe_stack {
 public:
  inline void push(T* t);
  inline T* pop();

 private:
  vector<unique_ptr<T>> stack;
  atomic_flag lock = ATOMIC_FLAG_INIT;
};

//
// Definitions
//

// Store `t` on top of the stack; the stack owns it from now on.
template <class T>
void threadsafe_stack<T>::push(T* t) {
  // Busy-wait until the flag is acquired.
  while (lock.test_and_set(memory_order_acquire)) {}
  stack.emplace_back(t);
  lock.clear(memory_order_release);
}

// Remove and return the top element, or nullptr when there is none. The
// caller becomes the owner of the returned object.
template <class T>
T* threadsafe_stack<T>::pop() {
  T* res = nullptr;

  while (lock.test_and_set(memory_order_acquire)) {}
  if (!stack.empty()) {
    res = stack.back().release();
    stack.pop_back();
  }
  lock.clear(memory_order_release);

  return res;
}
1979
|
|
|
|
|
|
|
|
1980
|
|
|
|
|
|
|
} // namespace utils |
1981
|
|
|
|
|
|
|
|
1982
|
|
|
|
|
|
|
///////// |
1983
|
|
|
|
|
|
|
// File: model/model_morphodita_parsito.h |
1984
|
|
|
|
|
|
|
///////// |
1985
|
|
|
|
|
|
|
|
1986
|
|
|
|
|
|
|
// This file is part of UDPipe . |
1987
|
|
|
|
|
|
|
// |
1988
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
1989
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
1990
|
|
|
|
|
|
|
// |
1991
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
1992
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
1993
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
1994
|
|
|
|
|
|
|
|
1995
|
4
|
|
|
|
|
|
// Implementation of the `model` interface; overrides new_tokenizer(), tag()
// and parse(). The constructor is private, so instances are obtained through
// the static load().
//
// NOTE(review): the template arguments of every `unique_ptr`, `vector` and
// `threadsafe_stack` member below are missing in this copy of the source
// (they appear to have been stripped during extraction); restore them from
// the original header before compiling.
class model_morphodita_parsito : public model {
 public:
  virtual input_format* new_tokenizer(const string& options) const override;
  virtual bool tag(sentence& s, const string& options, string& error) const override;
  virtual bool parse(sentence& s, const string& options, string& error) const override;

  // Load the model from the given stream.
  static model* load(istream& is);

 private:
  model_morphodita_parsito(unsigned version);
  unsigned version;
  enum { VERSION_LATEST = 3 };

  unique_ptr tokenizer_factory;
  unique_ptr splitter;
  // One tagger together with the flags stored alongside it.
  struct tagger_model {
    bool raw; bool upostag; int lemma; bool xpostag, feats;
    unique_ptr tagger;

    // The smart-pointer member is initialized from the raw `tagger` pointer.
    tagger_model(bool raw, bool upostag, int lemma, bool xpostag, bool feats, morphodita::tagger* tagger)
      : raw(raw), upostag(upostag), lemma(lemma), xpostag(xpostag), feats(feats), tagger(tagger) {}
  };
  vector taggers;
  unique_ptr parser;

  // Scratch buffers used while tagging; kept in a threadsafe_stack that is
  // mutable so that const member functions can use it.
  struct tagger_cache {
    vector forms_normalized;
    vector forms_string_pieces;
    vector lemmas;
  };
  mutable threadsafe_stack tagger_caches;

  // Scratch data used while parsing; stored the same way as tagger_cache.
  struct parser_cache {
    parsito::tree tree;
    named_values::map options;
  };
  mutable threadsafe_stack parser_caches;

  // Private parse() overload with an additional `cost` output pointer.
  bool parse(sentence& s, const string& options, string& error, double* cost) const;

  // input_format that wraps another tokenizer and keeps a reference to the
  // owning model.
  class joint_with_parsing_tokenizer : public input_format {
   public:
    // The `tokenizer` pointer is stored in the smart-pointer member below;
    // `model` is kept by reference and must outlive this object.
    joint_with_parsing_tokenizer(input_format* tokenizer, const model_morphodita_parsito& model,
                                 int max_sentence_len, double change_boundary_logprob, double sentence_logprob)
      : tokenizer(tokenizer), model(model), max_sentence_len(max_sentence_len),
        change_boundary_logprob(change_boundary_logprob), sentence_logprob(sentence_logprob) {}

    virtual bool read_block(istream& is, string& block) const override;
    virtual void reset_document(string_piece id) override;
    virtual void set_text(string_piece text, bool make_copy = false) override;
    virtual bool next_sentence(sentence& s, string& error) override;

   private:
    bool parse_paragraph(vector& paragraph, string& error);

    unique_ptr tokenizer;
    const model_morphodita_parsito& model;
    int max_sentence_len;
    double change_boundary_logprob;
    double sentence_logprob;

    string_piece text;
    string text_copy;
    bool new_document = true;
    string document_id;
    unsigned sentence_id = 1;
    vector sentences;
    size_t sentences_index = 0;
  };

  void fill_word_analysis(const morphodita::tagged_lemma& analysis, bool raw, bool upostag, int lemma, bool xpostag, bool feats, word& word) const;
  const string& normalize_form(string_piece form, string& output) const;
  const string& normalize_lemma(string_piece lemma, string& output) const;
  friend class trainer_morphodita_parsito;
};
2070
|
|
|
|
|
|
|
|
2071
|
|
|
|
|
|
|
///////// |
2072
|
|
|
|
|
|
|
// File: model/model.cpp |
2073
|
|
|
|
|
|
|
///////// |
2074
|
|
|
|
|
|
|
|
2075
|
|
|
|
|
|
|
// This file is part of UDPipe . |
2076
|
|
|
|
|
|
|
// |
2077
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2078
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2079
|
|
|
|
|
|
|
// |
2080
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2081
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2082
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2083
|
|
|
|
|
|
|
|
2084
|
2
|
|
|
|
|
|
// Definitions of the static string constants of `model`. DEFAULT is the empty
// string; the TOKENIZER_* constants hold the literal names shown (presumably
// tokenizer option names -- confirm against new_tokenizer()).
const string model::DEFAULT;
const string model::TOKENIZER_NORMALIZED_SPACES = "normalized_spaces";
const string model::TOKENIZER_PRESEGMENTED = "presegmented";
const string model::TOKENIZER_RANGES = "ranges";
2088
|
|
|
|
|
|
|
|
2089
|
1
|
|
|
|
|
|
// Load a model from the file `fname` (a UTF-8 path), opened in binary mode.
// Returns nullptr when the file cannot be opened; otherwise returns whatever
// the stream overload returns.
model* model::load(const char* fname) {
  ifstream in(path_from_utf8(fname).c_str(), ifstream::in | ifstream::binary);
  return in.is_open() ? load(in) : nullptr;
}
2094
|
|
|
|
|
|
|
|
2095
|
1
|
|
|
|
|
|
// Load a model from a stream.
//
// The stream starts with a one-byte name length followed by that many bytes
// of model name. Returns nullptr when the header cannot be read or the name
// is not a known model type; otherwise the rest of the stream is handed to
// the loader of that type.
model* model::load(istream& is) {
  char len;
  if (!is.get(len)) return nullptr;

  // Interpret the length byte as unsigned. Where plain char is signed, values
  // 128..255 would otherwise turn into a negative length, making the string
  // constructor throw instead of this function returning nullptr.
  const size_t name_len = static_cast<unsigned char>(len);
  string name(name_len, ' ');
  if (!is.read(&name[0], name_len)) return nullptr;

  if (name == "morphodita_parsito") return model_morphodita_parsito::load(is);

  return nullptr;
}
2105
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
///////// |
2107
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger_ids.h |
2108
|
|
|
|
|
|
|
///////// |
2109
|
|
|
|
|
|
|
|
2110
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
2111
|
|
|
|
|
|
|
// |
2112
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2113
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2114
|
|
|
|
|
|
|
// |
2115
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2116
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2117
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2118
|
|
|
|
|
|
|
|
2119
|
|
|
|
|
|
|
namespace morphodita { |
2120
|
|
|
|
|
|
|
|
2121
|
|
|
|
|
|
|
// Identifiers of the tagger variants, together with their textual names and
// the decoding order and window size associated with each of them.
class tagger_ids {
 public:
  enum tagger_id {
    CZECH2 = 0, CZECH3 = 1, CZECH2_3 = 6,
    /* 2 was used internally for ENGLISH3, but never released publicly */
    GENERIC2 = 3, GENERIC3 = 4, GENERIC4 = 5, GENERIC2_3 = 7,
    CONLLU2 = 8, CONLLU2_3 = 9, CONLLU3 = 10,
  };

  // Translate a tagger name to its id. Returns true and sets `id` when the
  // name is known; returns false and leaves `id` untouched otherwise.
  static bool parse(const string& str, tagger_id& id) {
    static const struct { const char* name; tagger_id id; } known[] = {
      {"czech2", CZECH2},
      {"czech2_3", CZECH2_3},
      {"czech3", CZECH3},
      {"generic2", GENERIC2},
      {"generic2_3", GENERIC2_3},
      {"generic3", GENERIC3},
      {"generic4", GENERIC4},
      {"conllu2", CONLLU2},
      {"conllu2_3", CONLLU2_3},
      {"conllu3", CONLLU3},
    };

    for (const auto& entry : known)
      if (str == entry.name) {
        id = entry.id;
        return true;
      }
    return false;
  }

  // Decoding order of the given tagger; 0 for a value outside the enum.
  static int decoding_order(tagger_id id) {
    switch (id) {
      case CZECH2:
      case CZECH2_3:
      case GENERIC2:
      case GENERIC2_3:
      case CONLLU2:
      case CONLLU2_3:
        return 2;
      case CZECH3:
      case GENERIC3:
      case CONLLU3:
        return 3;
      case GENERIC4:
        return 4;
    }
    return 0;
  }

  // Window size of the given tagger: 3 for the "2_3" variants, otherwise the
  // same as the decoding order.
  static int window_size(tagger_id id) {
    const bool is_2_3 = id == CZECH2_3 || id == GENERIC2_3 || id == CONLLU2_3;
    return is_2_3 ? 3 : decoding_order(id);
  }
};
2170
|
|
|
|
|
|
|
|
2171
|
|
|
|
|
|
|
typedef tagger_ids::tagger_id tagger_id; |
2172
|
|
|
|
|
|
|
|
2173
|
|
|
|
|
|
|
} // namespace morphodita |
2174
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
///////// |
2176
|
|
|
|
|
|
|
// File: tokenizer/morphodita_tokenizer_wrapper.h |
2177
|
|
|
|
|
|
|
///////// |
2178
|
|
|
|
|
|
|
|
2179
|
|
|
|
|
|
|
// This file is part of UDPipe . |
2180
|
|
|
|
|
|
|
// |
2181
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2182
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2183
|
|
|
|
|
|
|
// |
2184
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2185
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2186
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2187
|
|
|
|
|
|
|
|
2188
|
4
|
|
|
|
|
|
// input_format implemented on top of a morphodita::tokenizer, with a
// multiword_splitter pointer kept alongside it.
//
// NOTE(review): the template arguments of `unique_ptr` and `vector` below are
// missing in this copy of the source (they appear to have been stripped
// during extraction); restore them from the original header before compiling.
class morphodita_tokenizer_wrapper : public input_format {
 public:
  morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter, bool normalized_spaces, bool token_ranges);

  virtual bool read_block(istream& is, string& block) const override;
  virtual void reset_document(string_piece id) override;
  virtual void set_text(string_piece text, bool make_copy = false) override;
  virtual bool next_sentence(sentence& s, string& error) override;

 private:
  unique_ptr tokenizer;
  // Held as a raw pointer -- presumably not owned by the wrapper; TODO confirm.
  const multiword_splitter* splitter;
  bool normalized_spaces, token_ranges;

  // Per-document state.
  bool new_document = true;
  string document_id;
  unsigned preceeding_newlines = 2;
  unsigned sentence_id = 1;

  // Current input text and working buffers.
  string_piece text;
  string text_copy;
  size_t unicode_offset = 0, text_unicode_length = 0;
  string saved_spaces;
  vector forms;
  vector tokens;
  token tok;
};
2215
|
|
|
|
|
|
|
|
2216
|
|
|
|
|
|
|
///////// |
2217
|
|
|
|
|
|
|
// File: utils/getpara.h |
2218
|
|
|
|
|
|
|
///////// |
2219
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
2221
|
|
|
|
|
|
|
// |
2222
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2223
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2224
|
|
|
|
|
|
|
// |
2225
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2226
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2227
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2228
|
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
|
namespace utils { |
2230
|
|
|
|
|
|
|
|
2231
|
|
|
|
|
|
|
// |
2232
|
|
|
|
|
|
|
// Declarations |
2233
|
|
|
|
|
|
|
// |
2234
|
|
|
|
|
|
|
|
2235
|
|
|
|
|
|
|
// Read a paragraph until EOF or an empty line. All encountered \n are stored. |
2236
|
|
|
|
|
|
|
inline istream& getpara(istream& is, string& para); |
2237
|
|
|
|
|
|
|
|
2238
|
|
|
|
|
|
|
// |
2239
|
|
|
|
|
|
|
// Definitions |
2240
|
|
|
|
|
|
|
// |
2241
|
|
|
|
|
|
|
|
2242
|
0
|
|
|
|
|
|
// Reads lines from `is` into `para` until an empty line or the end of the
// stream is reached. Every line read -- including the terminating empty
// one -- is stored followed by '\n'. The stream is returned so the call can
// be used directly as a loop condition.
istream& getpara(istream& is, string& para) {
  para.clear();

  string line;
  while (getline(is, line)) {
    para += line;
    para += '\n';

    // An empty line terminates the paragraph (its '\n' is already stored).
    if (line.empty())
      break;
  }

  // When the end of the stream was hit after some text had been read, keep
  // only eofbit set so this call still reports success; the following call
  // will then fail.
  if (!para.empty() && is.eof())
    is.clear(istream::eofbit);

  return is;
}
2255
|
|
|
|
|
|
|
|
2256
|
|
|
|
|
|
|
} // namespace utils |
2257
|
|
|
|
|
|
|
|
2258
|
|
|
|
|
|
|
///////// |
2259
|
|
|
|
|
|
|
// File: utils/parse_double.h |
2260
|
|
|
|
|
|
|
///////// |
2261
|
|
|
|
|
|
|
|
2262
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
2263
|
|
|
|
|
|
|
// |
2264
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2265
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2266
|
|
|
|
|
|
|
// |
2267
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2268
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2269
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2270
|
|
|
|
|
|
|
|
2271
|
|
|
|
|
|
|
namespace utils { |
2272
|
|
|
|
|
|
|
|
2273
|
|
|
|
|
|
|
// |
2274
|
|
|
|
|
|
|
// Declarations |
2275
|
|
|
|
|
|
|
// |
2276
|
|
|
|
|
|
|
|
2277
|
|
|
|
|
|
|
// Try to parse a double from the given string. If the double cannot be parsed or does |
2278
|
|
|
|
|
|
|
// not fit into a double, false is returned and the error string is filled using the |
2279
|
|
|
|
|
|
|
// value_name argument. |
2280
|
|
|
|
|
|
|
inline bool parse_double(string_piece str, const char* value_name, double& value, string& error); |
2281
|
|
|
|
|
|
|
|
2282
|
|
|
|
|
|
|
// Try to parse a double from the given string. If the double cannot be parsed or does |
2283
|
|
|
|
|
|
|
// not fit into a double, an error is displayed and the program exits. |
2284
|
|
|
|
|
|
|
inline double parse_double(string_piece str, const char* value_name); |
2285
|
|
|
|
|
|
|
|
2286
|
|
|
|
|
|
|
// |
2287
|
|
|
|
|
|
|
// Definitions |
2288
|
|
|
|
|
|
|
// |
2289
|
|
|
|
|
|
|
|
2290
|
0
|
|
|
|
|
|
bool parse_double(string_piece str, const char* value_name, double& value, string& error) { |
2291
|
|
|
|
|
|
|
string_piece original = str; |
2292
|
|
|
|
|
|
|
|
2293
|
|
|
|
|
|
|
// Skip spaces |
2294
|
0
|
0
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2295
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2296
|
|
|
|
|
|
|
|
2297
|
|
|
|
|
|
|
// Allow plus/minus |
2298
|
|
|
|
|
|
|
bool negative = false; |
2299
|
0
|
0
|
|
|
|
|
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
0
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
negative = str.str[0] == '-'; |
2301
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2302
|
|
|
|
|
|
|
} |
2303
|
|
|
|
|
|
|
|
2304
|
|
|
|
|
|
|
// Parse value, checking for overflow/underflow |
2305
|
0
|
0
|
|
|
|
|
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': empty string."), false; |
2306
|
|
|
|
|
|
|
if (!(str.str[0] >= '0' || str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false; |
2307
|
|
|
|
|
|
|
|
2308
|
0
|
|
|
|
|
|
value = 0; |
2309
|
0
|
0
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2310
|
0
|
|
|
|
|
|
value = 10 * value + (str.str[0] - '0'); |
2311
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2312
|
|
|
|
|
|
|
} |
2313
|
|
|
|
|
|
|
|
2314
|
|
|
|
|
|
|
// If there is a decimal point, parse the rest of the |
2315
|
0
|
0
|
|
|
|
|
if (str.len && str.str[0] == '.') { |
|
|
0
|
|
|
|
|
|
2316
|
|
|
|
|
|
|
double divider = 1; |
2317
|
|
|
|
|
|
|
|
2318
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2319
|
0
|
0
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2320
|
0
|
|
|
|
|
|
value = 10 * value + (str.str[0] - '0'); |
2321
|
0
|
|
|
|
|
|
divider *= 10.; |
2322
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2323
|
|
|
|
|
|
|
} |
2324
|
|
|
|
|
|
|
|
2325
|
0
|
|
|
|
|
|
value /= divider; |
2326
|
|
|
|
|
|
|
} |
2327
|
0
|
0
|
|
|
|
|
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
2328
|
|
|
|
|
|
|
|
2329
|
|
|
|
|
|
|
// Optionally parse an exponent |
2330
|
0
|
0
|
|
|
|
|
if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) { |
|
|
0
|
|
|
|
|
|
2331
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2332
|
|
|
|
|
|
|
|
2333
|
|
|
|
|
|
|
double exponent = 0; |
2334
|
|
|
|
|
|
|
bool exponent_negative = false; |
2335
|
0
|
0
|
|
|
|
|
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
0
|
|
|
|
|
|
2336
|
|
|
|
|
|
|
exponent_negative = str.str[0] == '-'; |
2337
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2338
|
|
|
|
|
|
|
} |
2339
|
|
|
|
|
|
|
|
2340
|
0
|
0
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2341
|
0
|
|
|
|
|
|
exponent = 10 * exponent + (str.str[0] - '0'); |
2342
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2343
|
|
|
|
|
|
|
} |
2344
|
|
|
|
|
|
|
|
2345
|
0
|
0
|
|
|
|
|
exponent = pow(10., exponent_negative ? -exponent : exponent); |
2346
|
0
|
0
|
|
|
|
|
if (!isfinite(exponent)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent overflow occured."), false; |
2347
|
0
|
0
|
|
|
|
|
if (exponent == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent underflow occured."), false; |
2348
|
|
|
|
|
|
|
|
2349
|
0
|
0
|
|
|
|
|
if (value) { |
2350
|
0
|
|
|
|
|
|
value *= exponent; |
2351
|
0
|
0
|
|
|
|
|
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
2352
|
0
|
0
|
|
|
|
|
if (value == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': underflow occured."), false; |
2353
|
|
|
|
|
|
|
} |
2354
|
|
|
|
|
|
|
} |
2355
|
|
|
|
|
|
|
|
2356
|
|
|
|
|
|
|
// Apply initial minus |
2357
|
0
|
0
|
|
|
|
|
if (negative) value *= -1; |
2358
|
|
|
|
|
|
|
|
2359
|
|
|
|
|
|
|
// Skip spaces |
2360
|
0
|
0
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2361
|
0
|
|
|
|
|
|
str.str++, str.len--; |
2362
|
|
|
|
|
|
|
|
2363
|
|
|
|
|
|
|
// Check for remaining characters |
2364
|
0
|
0
|
|
|
|
|
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false; |
2365
|
|
|
|
|
|
|
|
2366
|
|
|
|
|
|
|
return true; |
2367
|
|
|
|
|
|
|
} |
2368
|
|
|
|
|
|
|
|
2369
|
0
|
|
|
|
|
|
// Convenience overload: parses `str` with the error-reporting overload and
// hands any resulting message to runtime_failure (defined elsewhere; per the
// declaration comment the error is displayed and the program exits).
double parse_double(string_piece str, const char* value_name) {
  string message;
  double parsed;

  if (!parse_double(str, value_name, parsed, message)) {
    runtime_failure(message);
  }

  return parsed;
}
2378
|
|
|
|
|
|
|
|
2379
|
|
|
|
|
|
|
} // namespace utils |
2380
|
|
|
|
|
|
|
|
2381
|
|
|
|
|
|
|
///////// |
2382
|
|
|
|
|
|
|
// File: model/model_morphodita_parsito.cpp |
2383
|
|
|
|
|
|
|
///////// |
2384
|
|
|
|
|
|
|
|
2385
|
|
|
|
|
|
|
// This file is part of UDPipe . |
2386
|
|
|
|
|
|
|
// |
2387
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2388
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2389
|
|
|
|
|
|
|
// |
2390
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2391
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2392
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2393
|
|
|
|
|
|
|
|
2394
|
|
|
|
|
|
|
// Versions: |
2395
|
|
|
|
|
|
|
// 1 - initial version |
2396
|
|
|
|
|
|
|
// 2 - add absolute lemmas (tagger_model::lemma == 2) |
2397
|
|
|
|
|
|
|
// - use Arabic and space normalization |
2398
|
|
|
|
|
|
|
|
2399
|
1
|
|
|
|
|
|
// Creates a tokenizer configured by `options` (parsed with
// named_values::parse).
//
// Recognized option names: "normalized_spaces", "ranges", "presegmented",
// "joint_with_parsing" and, for the latter, "joint_max_sentence_len",
// "joint_change_boundary_logprob" and "joint_sentence_logprob".
//
// Returns a newly allocated input_format released to the caller, or nullptr
// when the model has no tokenizer factory or when the options (or one of the
// numeric option values) cannot be parsed.
input_format* model_morphodita_parsito::new_tokenizer(const string& options) const {
  if (!tokenizer_factory)
    return nullptr;

  named_values::map parsed_options;
  string parse_error;
  if (!named_values::parse(options, parsed_options, parse_error))
    return nullptr;

  bool normalized_spaces = parsed_options.count("normalized_spaces");
  bool token_ranges = parsed_options.count("ranges");

  // The morphology of the first tagger, if any, is handed to the tokenizer factory.
  const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr;
  unique_ptr<input_format> result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges));

  // Presegmented
  if (parsed_options.count("presegmented") && result)
    result.reset(input_format::new_presegmented_tokenizer(result.release()));

  // Joint with parsing
  if (parsed_options.count("joint_with_parsing") && result) {
    int max_sentence_len = 20;
    if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
      return nullptr;

    double change_boundary_logprob = -0.5;
    if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
      return nullptr;

    double sentence_logprob = -0.5;
    if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
      return nullptr;

    // The joint tokenizer wraps the tokenizer built so far.
    result.reset(new joint_with_parsing_tokenizer(result.release(), *this, max_sentence_len, change_boundary_logprob, sentence_logprob));
  }

  return result.release();
}
2437
|
|
|
|
|
|
|
|
2438
|
1
|
|
|
|
|
|
bool model_morphodita_parsito::tag(sentence& s, const string& /*options*/, string& error) const { |
2439
|
|
|
|
|
|
|
error.clear(); |
2440
|
|
|
|
|
|
|
|
2441
|
1
|
50
|
|
|
|
|
if (taggers.empty()) return error.assign("No tagger defined for the UDPipe model!"), false; |
2442
|
1
|
50
|
|
|
|
|
if (s.empty()) return true; |
2443
|
|
|
|
|
|
|
|
2444
|
1
|
|
|
|
|
|
tagger_cache* c = tagger_caches.pop(); |
2445
|
1
|
50
|
|
|
|
|
if (!c) c = new tagger_cache(); |
2446
|
|
|
|
|
|
|
|
2447
|
|
|
|
|
|
|
// Prepare input forms |
2448
|
1
|
|
|
|
|
|
c->forms_normalized.resize(s.words.size() - 1); |
2449
|
1
|
|
|
|
|
|
c->forms_string_pieces.resize(s.words.size() - 1); |
2450
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) |
2451
|
7
|
|
|
|
|
|
c->forms_string_pieces[i - 1] = normalize_form(s.words[i].form, c->forms_normalized[i - 1]); |
2452
|
|
|
|
|
|
|
|
2453
|
|
|
|
|
|
|
// Clear first |
2454
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
2455
|
7
|
|
|
|
|
|
s.words[i].lemma.assign("_"); |
2456
|
|
|
|
|
|
|
s.words[i].upostag.clear(); |
2457
|
|
|
|
|
|
|
s.words[i].xpostag.clear(); |
2458
|
|
|
|
|
|
|
s.words[i].feats.clear(); |
2459
|
|
|
|
|
|
|
} |
2460
|
|
|
|
|
|
|
|
2461
|
|
|
|
|
|
|
// Fill information from the tagger models |
2462
|
2
|
100
|
|
|
|
|
for (auto&& tagger : taggers) { |
2463
|
1
|
50
|
|
|
|
|
if (!tagger.tagger) return error.assign("No tagger defined for the UDPipe model!"), false; |
2464
|
|
|
|
|
|
|
|
2465
|
1
|
|
|
|
|
|
tagger.tagger->tag(c->forms_string_pieces, c->lemmas); |
2466
|
|
|
|
|
|
|
|
2467
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < c->lemmas.size(); i++) |
2468
|
7
|
|
|
|
|
|
fill_word_analysis(c->lemmas[i], tagger.raw, tagger.upostag, tagger.lemma, tagger.xpostag, tagger.feats, s.words[i+1]); |
2469
|
|
|
|
|
|
|
} |
2470
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
// For raw tagger models, fill MorphoGuesser=Yes where appropriate |
2472
|
1
|
50
|
|
|
|
|
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
2473
|
0
|
|
|
|
|
|
const auto* morpho = taggers[0].tagger->get_morpho(); |
2474
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < c->forms_string_pieces.size(); i++) { |
2475
|
0
|
0
|
|
|
|
|
if (morpho->analyze(c->forms_string_pieces[i], morphodita::morpho::GUESSER, c->lemmas) == morphodita::morpho::GUESSER) |
2476
|
0
|
0
|
|
|
|
|
s.words[i + 1].misc.append(s.words[i + 1].misc.empty() ? "" : "|").append("MorphoGuesser=Yes"); |
2477
|
|
|
|
|
|
|
} |
2478
|
|
|
|
|
|
|
} |
2479
|
|
|
|
|
|
|
|
2480
|
1
|
|
|
|
|
|
tagger_caches.push(c); |
2481
|
1
|
|
|
|
|
|
return true; |
2482
|
|
|
|
|
|
|
} |
2483
|
|
|
|
|
|
|
|
2484
|
1
|
|
|
|
|
|
bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error) const { |
2485
|
1
|
|
|
|
|
|
return parse(s, options, error, nullptr); |
2486
|
|
|
|
|
|
|
} |
2487
|
|
|
|
|
|
|
|
2488
|
1
|
|
|
|
|
|
bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error, double* cost) const { |
2489
|
|
|
|
|
|
|
error.clear(); |
2490
|
|
|
|
|
|
|
|
2491
|
1
|
50
|
|
|
|
|
if (!parser) return error.assign("No parser defined for the UDPipe model!"), false; |
2492
|
1
|
50
|
|
|
|
|
if (s.empty()) return true; |
2493
|
|
|
|
|
|
|
|
2494
|
1
|
|
|
|
|
|
parser_cache* c = parser_caches.pop(); |
2495
|
1
|
50
|
|
|
|
|
if (!c) c = new parser_cache(); |
2496
|
|
|
|
|
|
|
|
2497
|
1
|
|
|
|
|
|
int beam_search = 5; |
2498
|
1
|
50
|
|
|
|
|
if (!named_values::parse(options, c->options, error)) |
2499
|
|
|
|
|
|
|
return false; |
2500
|
2
|
50
|
|
|
|
|
if (c->options.count("beam_search")) |
2501
|
0
|
0
|
|
|
|
|
if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error)) |
|
|
0
|
|
|
|
|
|
2502
|
|
|
|
|
|
|
return false; |
2503
|
|
|
|
|
|
|
|
2504
|
1
|
|
|
|
|
|
c->tree.clear(); |
2505
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
2506
|
7
|
|
|
|
|
|
c->tree.add_node(string()); |
2507
|
7
|
|
|
|
|
|
normalize_form(s.words[i].form, c->tree.nodes.back().form); |
2508
|
7
|
|
|
|
|
|
normalize_lemma(s.words[i].lemma, c->tree.nodes.back().lemma); |
2509
|
14
|
|
|
|
|
|
c->tree.nodes.back().upostag.assign(s.words[i].upostag); |
2510
|
14
|
|
|
|
|
|
c->tree.nodes.back().xpostag.assign(s.words[i].xpostag); |
2511
|
14
|
|
|
|
|
|
c->tree.nodes.back().feats.assign(s.words[i].feats); |
2512
|
14
|
|
|
|
|
|
c->tree.nodes.back().deps.assign(s.words[i].deps); |
2513
|
14
|
|
|
|
|
|
c->tree.nodes.back().misc.assign(s.words[i].misc); |
2514
|
|
|
|
|
|
|
} |
2515
|
|
|
|
|
|
|
|
2516
|
1
|
|
|
|
|
|
parser->parse(c->tree, beam_search, cost); |
2517
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) |
2518
|
7
|
|
|
|
|
|
s.set_head(i, c->tree.nodes[i].head, c->tree.nodes[i].deprel); |
2519
|
|
|
|
|
|
|
|
2520
|
1
|
|
|
|
|
|
parser_caches.push(c); |
2521
|
|
|
|
|
|
|
return true; |
2522
|
|
|
|
|
|
|
} |
2523
|
|
|
|
|
|
|
|
2524
|
1
|
|
|
|
|
|
// Loads a model from `is`.
//
// Stream layout, in reading order: a version byte (1..VERSION_LATEST); for
// version >= 2 two sentinel bytes 0x7F; a tokenizer flag followed, when
// non-zero, by a tokenizer factory and a multiword splitter; a tagger count
// followed by that many taggers, each preceded by its lemma, xpostag and
// feats bytes; and a parser flag followed, when non-zero, by a parser.
//
// Returns a newly allocated model released to the caller, or nullptr when
// the stream ends prematurely or any component fails to load.
model* model_morphodita_parsito::load(istream& is) {
  char version;
  if (!is.get(version)) return nullptr;
  if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;

  // Because UDPipe 1.0 does not check the model version,
  // a specific sentinel was added since version 2 so that
  // loading of such a model fails on UDPipe 1.0
  if (version >= 2) {
    char sentinel;
    if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
    if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
  }

  unique_ptr<model_morphodita_parsito> m(new model_morphodita_parsito((unsigned char)version));
  if (!m) return nullptr;

  // Tokenizer factory and multiword splitter; both present only when the flag is set.
  char tokenizer;
  if (!is.get(tokenizer)) return nullptr;
  m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
  if (tokenizer && !m->tokenizer_factory) return nullptr;
  m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr);
  if (tokenizer && !m->splitter) return nullptr;

  // Taggers
  m->taggers.clear();
  char taggers; if (!is.get(taggers)) return nullptr;
  for (char i = 0; i < taggers; i++) {
    char lemma; if (!is.get(lemma)) return nullptr;
    char xpostag; if (!is.get(xpostag)) return nullptr;
    char feats; if (!is.get(feats)) return nullptr;

    // Peek at the tagger type without consuming it: every tagger which is
    // not one of the CONLLU variants is treated as a raw MorphoDiTa model.
    int model_type = is.peek();
    bool raw = !(model_type == morphodita::tagger_ids::CONLLU2 ||
                 model_type == morphodita::tagger_ids::CONLLU2_3 ||
                 model_type == morphodita::tagger_ids::CONLLU3);
    morphodita::tagger* tagger = morphodita::tagger::load(is);
    if (!tagger) return nullptr;
    m->taggers.emplace_back(raw, i == 0, int(lemma), bool(xpostag), bool(feats), tagger);
  }

  // Parser; present only when the flag is set.
  char parser;
  if (!is.get(parser)) return nullptr;
  m->parser.reset(parser ? parsito::parser::load(is) : nullptr);
  if (parser && !m->parser) return nullptr;

  return m.release();
}
2570
|
|
|
|
|
|
|
|
2571
|
0
|
|
|
|
|
|
// Creates an empty model which remembers the given model format version.
model_morphodita_parsito::model_morphodita_parsito(unsigned version)
    : version(version) {
}
2572
|
|
|
|
|
|
|
|
2573
|
0
|
|
|
|
|
|
// Reads everything remaining in `is` into `block`, storing each line
// followed by '\n'. Returns true if any text was read.
bool model_morphodita_parsito::joint_with_parsing_tokenizer::read_block(istream& is, string& block) const {
  block.clear();

  string line;
  while (getline(is, line)) {
    block += line;
    block += '\n';
  }

  // Hitting the end of the stream after reading some text is a success:
  // keep only eofbit set so the conversion below yields true.
  if (!block.empty() && is.eof())
    is.clear(istream::eofbit);

  return bool(is);
}
2584
|
|
|
|
|
|
|
|
2585
|
0
|
|
|
|
|
|
// Starts a new document with the given id: the sentence counter is reset,
// the current text is replaced by an empty one and all buffered sentences
// are dropped.
void model_morphodita_parsito::joint_with_parsing_tokenizer::reset_document(string_piece id) {
  // Document-level state
  new_document = true;
  document_id.assign(id.str, id.len);
  sentence_id = 1;

  // Forget the current text
  set_text("");

  // Forget the already segmented sentences
  sentences.clear();
  sentences_index = 0;
}
2593
|
|
|
|
|
|
|
|
2594
|
0
|
|
|
|
|
|
// Sets the text to be segmented. With `make_copy` the characters are copied
// into text_copy and the stored piece points at that copy; otherwise the
// stored piece refers to the memory passed in by the caller.
void model_morphodita_parsito::joint_with_parsing_tokenizer::set_text(string_piece text, bool make_copy) {
  string_piece stored = text;

  if (make_copy) {
    text_copy.assign(text.str, text.len);
    stored.str = text_copy.c_str();
  }

  this->text = stored;
}
2601
|
|
|
|
|
|
|
|
2602
|
0
|
|
|
|
|
|
bool model_morphodita_parsito::joint_with_parsing_tokenizer::next_sentence(sentence& s, string& error) { |
2603
|
|
|
|
|
|
|
error.clear(); |
2604
|
|
|
|
|
|
|
|
2605
|
0
|
0
|
|
|
|
|
if (text.len) { |
2606
|
|
|
|
|
|
|
sentences.clear(); |
2607
|
0
|
|
|
|
|
|
sentences_index = 0; |
2608
|
|
|
|
|
|
|
|
2609
|
0
|
|
|
|
|
|
tokenizer->set_text(text, false); |
2610
|
|
|
|
|
|
|
|
2611
|
0
|
|
|
|
|
|
sentence input; |
2612
|
0
|
|
|
|
|
|
vector paragraph; |
2613
|
0
|
0
|
|
|
|
|
while (tokenizer->next_sentence(input, error)) { |
|
|
0
|
|
|
|
|
|
2614
|
0
|
0
|
|
|
|
|
if (input.get_new_par() && !paragraph.empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2615
|
0
|
0
|
|
|
|
|
if (!parse_paragraph(paragraph, error)) return false; |
|
|
0
|
|
|
|
|
|
2616
|
0
|
0
|
|
|
|
|
for (auto&& sentence : paragraph) |
2617
|
0
|
0
|
|
|
|
|
sentences.push_back(sentence); |
2618
|
|
|
|
|
|
|
paragraph.clear(); |
2619
|
|
|
|
|
|
|
} |
2620
|
0
|
0
|
|
|
|
|
paragraph.push_back(input); |
2621
|
|
|
|
|
|
|
} |
2622
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
2623
|
|
|
|
|
|
|
|
2624
|
0
|
0
|
|
|
|
|
if (!paragraph.empty()) { |
2625
|
0
|
0
|
|
|
|
|
if (!parse_paragraph(paragraph, error)) return false; |
|
|
0
|
|
|
|
|
|
2626
|
0
|
0
|
|
|
|
|
for (auto&& sentence : paragraph) |
2627
|
0
|
0
|
|
|
|
|
sentences.push_back(sentence); |
2628
|
|
|
|
|
|
|
} |
2629
|
|
|
|
|
|
|
|
2630
|
0
|
|
|
|
|
|
text.len = 0; |
2631
|
|
|
|
|
|
|
} |
2632
|
|
|
|
|
|
|
|
2633
|
0
|
0
|
|
|
|
|
if (sentences_index < sentences.size()) { |
2634
|
0
|
|
|
|
|
|
s = sentences[sentences_index++]; |
2635
|
0
|
|
|
|
|
|
return true; |
2636
|
|
|
|
|
|
|
} |
2637
|
|
|
|
|
|
|
|
2638
|
|
|
|
|
|
|
return false; |
2639
|
|
|
|
|
|
|
} |
2640
|
|
|
|
|
|
|
|
2641
|
0
|
|
|
|
|
|
bool model_morphodita_parsito::joint_with_parsing_tokenizer::parse_paragraph(vector& paragraph, string& error) { |
2642
|
0
|
|
|
|
|
|
sentence all_words; |
2643
|
0
|
0
|
|
|
|
|
vector sentence_boundary(1, true); |
2644
|
0
|
0
|
|
|
|
|
vector token_boundary(1, true); |
2645
|
|
|
|
|
|
|
|
2646
|
0
|
0
|
|
|
|
|
for (auto&& s : paragraph) { |
2647
|
0
|
|
|
|
|
|
unsigned offset = all_words.words.size() - 1; |
2648
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < s.words.size(); i++) { |
2649
|
0
|
0
|
|
|
|
|
all_words.words.push_back(s.words[i]); |
2650
|
0
|
|
|
|
|
|
all_words.words.back().id += offset; |
2651
|
0
|
0
|
|
|
|
|
sentence_boundary.push_back(i+1 == s.words.size()); |
2652
|
0
|
0
|
|
|
|
|
token_boundary.push_back(true); |
2653
|
|
|
|
|
|
|
} |
2654
|
|
|
|
|
|
|
|
2655
|
0
|
0
|
|
|
|
|
for (auto&& mwt : s.multiword_tokens) { |
2656
|
0
|
0
|
|
|
|
|
all_words.multiword_tokens.push_back(mwt); |
2657
|
0
|
|
|
|
|
|
all_words.multiword_tokens.back().id_first += offset; |
2658
|
0
|
|
|
|
|
|
all_words.multiword_tokens.back().id_last += offset; |
2659
|
0
|
0
|
|
|
|
|
for (int i = all_words.multiword_tokens.back().id_first; i < all_words.multiword_tokens.back().id_last; i++) |
2660
|
0
|
|
|
|
|
|
token_boundary[i] = false; |
2661
|
|
|
|
|
|
|
} |
2662
|
|
|
|
|
|
|
} |
2663
|
|
|
|
|
|
|
|
2664
|
0
|
0
|
|
|
|
|
vector best_logprob(all_words.words.size(), -numeric_limits::infinity()); best_logprob[0] = 0.; |
2665
|
0
|
0
|
|
|
|
|
vector best_length(all_words.words.size(), 0); |
2666
|
0
|
0
|
|
|
|
|
sentence s; |
2667
|
|
|
|
|
|
|
|
2668
|
0
|
0
|
|
|
|
|
for (unsigned start = 1; start < all_words.words.size(); start++) { |
2669
|
0
|
0
|
|
|
|
|
if (!token_boundary[start - 1]) continue; |
2670
|
0
|
0
|
|
|
|
|
s.clear(); |
2671
|
0
|
0
|
|
|
|
|
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2672
|
0
|
0
|
|
|
|
|
s.words.push_back(all_words.words[end - 1]); |
2673
|
0
|
|
|
|
|
|
s.words.back().id -= start - 1; |
2674
|
0
|
0
|
|
|
|
|
if (!token_boundary[end - 1]) continue; |
2675
|
|
|
|
|
|
|
|
2676
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < s.words.size(); i++) { |
2677
|
0
|
|
|
|
|
|
s.words[i].head = -1; |
2678
|
|
|
|
|
|
|
s.words[i].children.clear(); |
2679
|
|
|
|
|
|
|
} |
2680
|
|
|
|
|
|
|
|
2681
|
|
|
|
|
|
|
double cost; |
2682
|
0
|
0
|
|
|
|
|
if (!model.parse(s, DEFAULT, error, &cost)) return false; |
|
|
0
|
|
|
|
|
|
2683
|
0
|
|
|
|
|
|
cost += sentence_logprob + change_boundary_logprob * (2 - int(sentence_boundary[start - 1]) - int(sentence_boundary[end - 1])); |
2684
|
0
|
0
|
|
|
|
|
if (best_logprob[start - 1] + cost > best_logprob[end - 1]) { |
2685
|
0
|
|
|
|
|
|
best_logprob[end - 1] = best_logprob[start - 1] + cost; |
2686
|
0
|
|
|
|
|
|
best_length[end - 1] = end - start; |
2687
|
|
|
|
|
|
|
} |
2688
|
|
|
|
|
|
|
} |
2689
|
|
|
|
|
|
|
} |
2690
|
|
|
|
|
|
|
|
2691
|
|
|
|
|
|
|
vector sentence_lengths; |
2692
|
0
|
0
|
|
|
|
|
for (unsigned end = all_words.words.size(); end > 1; end -= best_length[end - 1]) |
2693
|
0
|
0
|
|
|
|
|
sentence_lengths.push_back(best_length[end - 1]); |
2694
|
|
|
|
|
|
|
|
2695
|
|
|
|
|
|
|
paragraph.clear(); |
2696
|
|
|
|
|
|
|
|
2697
|
0
|
|
|
|
|
|
sentence_lengths.push_back(1); |
2698
|
|
|
|
|
|
|
reverse(sentence_lengths.begin(), sentence_lengths.end()); |
2699
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < sentence_lengths.size(); i++) { |
2700
|
0
|
|
|
|
|
|
sentence_lengths[i] += sentence_lengths[i - 1]; |
2701
|
|
|
|
|
|
|
|
2702
|
0
|
0
|
|
|
|
|
paragraph.emplace_back(); |
2703
|
0
|
0
|
|
|
|
|
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2704
|
0
|
0
|
|
|
|
|
paragraph.back().multiword_tokens.push_back(all_words.multiword_tokens.front()); |
2705
|
0
|
|
|
|
|
|
paragraph.back().multiword_tokens.back().id_first -= sentence_lengths[i-1] - 1; |
2706
|
0
|
|
|
|
|
|
paragraph.back().multiword_tokens.back().id_last -= sentence_lengths[i-1] - 1; |
2707
|
|
|
|
|
|
|
all_words.multiword_tokens.erase(all_words.multiword_tokens.begin()); |
2708
|
|
|
|
|
|
|
} |
2709
|
|
|
|
|
|
|
|
2710
|
0
|
0
|
|
|
|
|
for (unsigned word = sentence_lengths[i - 1]; word < sentence_lengths[i]; word++) { |
2711
|
0
|
0
|
|
|
|
|
paragraph.back().words.push_back(all_words.words[word]); |
2712
|
0
|
|
|
|
|
|
paragraph.back().words.back().id -= sentence_lengths[i-1] - 1; |
2713
|
0
|
|
|
|
|
|
paragraph.back().words.back().head = -1; |
2714
|
|
|
|
|
|
|
paragraph.back().words.back().children.clear(); |
2715
|
|
|
|
|
|
|
} |
2716
|
|
|
|
|
|
|
} |
2717
|
|
|
|
|
|
|
|
2718
|
0
|
0
|
|
|
|
|
if (!paragraph.empty()) { |
2719
|
0
|
0
|
|
|
|
|
if (new_document) { |
2720
|
0
|
0
|
|
|
|
|
paragraph.front().set_new_doc(true, document_id); |
2721
|
0
|
|
|
|
|
|
new_document = false; |
2722
|
|
|
|
|
|
|
} |
2723
|
|
|
|
|
|
|
|
2724
|
0
|
0
|
|
|
|
|
paragraph.front().set_new_par(true); |
2725
|
|
|
|
|
|
|
} |
2726
|
|
|
|
|
|
|
|
2727
|
|
|
|
|
|
|
return true; |
2728
|
|
|
|
|
|
|
} |
2729
|
|
|
|
|
|
|
|
2730
|
7
|
|
|
|
|
|
// Copies the requested parts of a tagger analysis into `word`.
//
// For raw models the lemma and the whole tag are copied verbatim (the tag
// goes to xpostag). Otherwise the lemma is stored according to `lemma`
// (1 = as is, 2 = with the ~replacement~normalized_form handling below),
// spaces encoded in the lemma are restored according to the model version,
// and the tag is split into upostag, xpostag and feats using the first
// character of the tag as the separator.
void model_morphodita_parsito::fill_word_analysis(const morphodita::tagged_lemma& analysis, bool raw, bool upostag, int lemma, bool xpostag, bool feats, word& word) const {
  const auto& tag = analysis.tag;
  const auto& tagged_lemma = analysis.lemma;

  // Handle raw MorphoDiTa models.
  if (raw) {
    if (lemma) word.lemma.assign(tagged_lemma);
    if (xpostag) word.xpostag.assign(tag);
    return;
  }

  // Lemma
  if (lemma == 1 || lemma == 2)
    word.lemma.assign(tagged_lemma);

  // Lemma matching ~replacement~normalized_form is changed to replacement.
  if (lemma == 2 && tagged_lemma[0] == '~') {
    auto second_tilde = tagged_lemma.find('~', 1);
    if (second_tilde != string::npos) {
      // word.lemma temporarily holds the normalized form for the comparison.
      normalize_form(word.form, word.lemma);
      bool form_matches = tagged_lemma.compare(second_tilde + 1, string::npos, word.lemma) == 0;
      if (form_matches)
        word.lemma.assign(tagged_lemma, 1, second_tilde - 1);
      else
        word.lemma.assign(tagged_lemma);
    }
  }

  if (version == 2) {
    // Replace '\001' back to spaces
    for (size_t i = 0; i < word.lemma.size(); i++)
      if (word.lemma[i] == '\001')
        word.lemma[i] = ' ';
  } else if (version >= 3) {
    // Replace '0xC2 0xA0' back to spaces
    for (size_t i = 0; i + 1 < word.lemma.size(); i++) {
      bool nbsp_here = word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0);
      if (nbsp_here)
        word.lemma.replace(i, 2, 1, ' ');
    }
  }

  if (!(upostag || xpostag || feats)) return;

  // UPOSTag: the field between the leading separator and its next occurrence
  char separator = tag[0];
  size_t field_begin = min(size_t(1), tag.size());
  size_t field_end = min(tag.find(separator, 1), tag.size());
  if (upostag) word.upostag.assign(tag, field_begin, field_end - field_begin);

  if (!(xpostag || feats)) return;

  // XPOSTag: the following field
  field_begin = min(field_end + 1, tag.size());
  field_end = min(tag.find(separator, field_begin), tag.size());
  if (xpostag) word.xpostag.assign(tag, field_begin, field_end - field_begin);

  if (!feats) return;

  // Features: everything after the second field
  field_begin = min(field_end + 1, tag.size());
  word.feats.assign(tag, field_begin, tag.size() - field_begin);
}
2788
|
|
|
|
|
|
|
|
2789
|
14
|
|
|
|
|
|
const string& model_morphodita_parsito::normalize_form(string_piece form, string& output) const { |
2790
|
|
|
|
|
|
|
using unilib::utf8; |
2791
|
|
|
|
|
|
|
|
2792
|
|
|
|
|
|
|
// No normalization on version 1 |
2793
|
28
|
50
|
|
|
|
|
if (version <= 1) return output.assign(form.str, form.len); |
2794
|
|
|
|
|
|
|
|
2795
|
|
|
|
|
|
|
// If requested, replace space by \001 in version 2 and by (\u00a0) since version 3 |
2796
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
// Arabic normalization since version 2, implementation resulted from |
2798
|
|
|
|
|
|
|
// discussion with Otakar Smrz and Nasrin Taghizadeh. |
2799
|
|
|
|
|
|
|
// 1. Remove https://codepoints.net/U+0640 without any reasonable doubt :) |
2800
|
|
|
|
|
|
|
// 2. Remove https://codepoints.net/U+0652 |
2801
|
|
|
|
|
|
|
// 3. Remove https://codepoints.net/U+0670 |
2802
|
|
|
|
|
|
|
// 4. Remove everything from https://codepoints.net/U+0653 to |
2803
|
|
|
|
|
|
|
// https://codepoints.net/U+0657 though they are probably very rare in date |
2804
|
|
|
|
|
|
|
// 5. Remove everything from https://codepoints.net/U+064B to |
2805
|
|
|
|
|
|
|
// https://codepoints.net/U+0650 |
2806
|
|
|
|
|
|
|
// 6. Remove https://codepoints.net/U+0651 |
2807
|
|
|
|
|
|
|
// 7. Replace https://codepoints.net/U+0671 with https://codepoints.net/U+0627 |
2808
|
|
|
|
|
|
|
// 8. Replace https://codepoints.net/U+0622 with https://codepoints.net/U+0627 |
2809
|
|
|
|
|
|
|
// 9. Replace https://codepoints.net/U+0623 with https://codepoints.net/U+0627 |
2810
|
|
|
|
|
|
|
// 10. Replace https://codepoints.net/U+0625 with https://codepoints.net/U+0627 |
2811
|
|
|
|
|
|
|
// 11. Replace https://codepoints.net/U+0624 with https://codepoints.net/U+0648 |
2812
|
|
|
|
|
|
|
// 12. Replace https://codepoints.net/U+0626 with https://codepoints.net/U+064A |
2813
|
|
|
|
|
|
|
// One might also consider replacing some Farsi characters that might be typed |
2814
|
|
|
|
|
|
|
// unintentionally (by Iranians writing Arabic language texts): |
2815
|
|
|
|
|
|
|
// 13. Replace https://codepoints.net/U+06CC with https://codepoints.net/U+064A |
2816
|
|
|
|
|
|
|
// 14. Replace https://codepoints.net/U+06A9 with https://codepoints.net/U+0643 |
2817
|
|
|
|
|
|
|
// 15. Replace https://codepoints.net/U+06AA with https://codepoints.net/U+0643 |
2818
|
|
|
|
|
|
|
// |
2819
|
|
|
|
|
|
|
// Not implemented: |
2820
|
|
|
|
|
|
|
// There is additional challenge with data coming from Egypt (such as printed |
2821
|
|
|
|
|
|
|
// or online newspapers), where the word-final https://codepoints.net/U+064A |
2822
|
|
|
|
|
|
|
// may be switched for https://codepoints.net/U+0649 and visa versa. Also, the |
2823
|
|
|
|
|
|
|
// word-final https://codepoints.net/U+0647 could actually represent https:// |
2824
|
|
|
|
|
|
|
// codepoints.net/U+0629. You can experiment with the following replacements, |
2825
|
|
|
|
|
|
|
// but I would rather apply them only after classifying the whole document as |
2826
|
|
|
|
|
|
|
// following such convention: |
2827
|
|
|
|
|
|
|
// 1. Replace https://codepoints.net/U+0629 with https://codepoints.net/U+0647 |
2828
|
|
|
|
|
|
|
// (frequent femine ending markers would appear like a third-person |
2829
|
|
|
|
|
|
|
// masculine pronoun clitic instead) |
2830
|
|
|
|
|
|
|
// 2. Replace https://codepoints.net/U+0649 with https://codepoints.net/U+064A |
2831
|
|
|
|
|
|
|
// (some "weak" words would become even more ambiguous or appear as if |
2832
|
|
|
|
|
|
|
// with a first-person pronoun clitic) |
2833
|
|
|
|
|
|
|
|
2834
|
|
|
|
|
|
|
output.clear(); |
2835
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(form.str, form.len)) { |
2836
|
|
|
|
|
|
|
// Arabic normalization |
2837
|
0
|
0
|
|
|
|
|
if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {} |
|
|
0
|
|
|
|
|
|
2838
|
0
|
0
|
|
|
|
|
else if (chr == 0x622) utf8::append(output, 0x627); |
2839
|
0
|
0
|
|
|
|
|
else if (chr == 0x623) utf8::append(output, 0x627); |
2840
|
0
|
0
|
|
|
|
|
else if (chr == 0x624) utf8::append(output, 0x648); |
2841
|
0
|
0
|
|
|
|
|
else if (chr == 0x625) utf8::append(output, 0x627); |
2842
|
0
|
0
|
|
|
|
|
else if (chr == 0x626) utf8::append(output, 0x64A); |
2843
|
0
|
0
|
|
|
|
|
else if (chr == 0x671) utf8::append(output, 0x627); |
2844
|
0
|
0
|
|
|
|
|
else if (chr == 0x6A9) utf8::append(output, 0x643); |
2845
|
0
|
0
|
|
|
|
|
else if (chr == 0x6AA) utf8::append(output, 0x643); |
2846
|
0
|
0
|
|
|
|
|
else if (chr == 0x6CC) utf8::append(output, 0x64A); |
2847
|
|
|
|
|
|
|
// Space normalization |
2848
|
0
|
0
|
|
|
|
|
else if (chr == ' ' && version == 2) utf8::append(output, 0x01); |
|
|
0
|
|
|
|
|
|
2849
|
0
|
0
|
|
|
|
|
else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0); |
|
|
0
|
|
|
|
|
|
2850
|
|
|
|
|
|
|
// Default |
2851
|
0
|
|
|
|
|
|
else utf8::append(output, chr); |
2852
|
|
|
|
|
|
|
} |
2853
|
|
|
|
|
|
|
|
2854
|
|
|
|
|
|
|
// Make sure we do not remove everything |
2855
|
0
|
0
|
|
|
|
|
if (output.empty() && form.len) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2856
|
0
|
|
|
|
|
|
utf8::append(output, utf8::first(form.str, form.len)); |
2857
|
|
|
|
|
|
|
|
2858
|
|
|
|
|
|
|
return output; |
2859
|
|
|
|
|
|
|
} |
2860
|
|
|
|
|
|
|
|
2861
|
7
|
|
|
|
|
|
const string& model_morphodita_parsito::normalize_lemma(string_piece lemma, string& output) const { |
2862
|
|
|
|
|
|
|
using unilib::utf8; |
2863
|
|
|
|
|
|
|
|
2864
|
|
|
|
|
|
|
// No normalization on version 1 and 2 |
2865
|
14
|
50
|
|
|
|
|
if (version <= 2) return output.assign(lemma.str, lemma.len); |
2866
|
|
|
|
|
|
|
|
2867
|
|
|
|
|
|
|
// Normalize spaces by since version 3 |
2868
|
|
|
|
|
|
|
output.clear(); |
2869
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < lemma.len; i++) { |
2870
|
|
|
|
|
|
|
// Space normalization |
2871
|
0
|
0
|
|
|
|
|
if (lemma.str[i] == ' ') utf8::append(output, 0xA0); |
2872
|
|
|
|
|
|
|
// Default |
2873
|
0
|
|
|
|
|
|
else output.push_back(lemma.str[i]); |
2874
|
|
|
|
|
|
|
} |
2875
|
|
|
|
|
|
|
|
2876
|
|
|
|
|
|
|
return output; |
2877
|
|
|
|
|
|
|
} |
2878
|
|
|
|
|
|
|
|
2879
|
|
|
|
|
|
|
///////// |
2880
|
|
|
|
|
|
|
// File: model/pipeline.h |
2881
|
|
|
|
|
|
|
///////// |
2882
|
|
|
|
|
|
|
|
2883
|
|
|
|
|
|
|
// This file is part of UDPipe . |
2884
|
|
|
|
|
|
|
// |
2885
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2886
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2887
|
|
|
|
|
|
|
// |
2888
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2889
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2890
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2891
|
|
|
|
|
|
|
|
2892
|
|
|
|
|
|
|
class pipeline { |
2893
|
|
|
|
|
|
|
public: |
2894
|
|
|
|
|
|
|
pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output); |
2895
|
|
|
|
|
|
|
|
2896
|
|
|
|
|
|
|
void set_model(const model* m); |
2897
|
|
|
|
|
|
|
void set_input(const string& input); |
2898
|
|
|
|
|
|
|
void set_tagger(const string& tagger); |
2899
|
|
|
|
|
|
|
void set_parser(const string& parser); |
2900
|
|
|
|
|
|
|
void set_output(const string& output); |
2901
|
|
|
|
|
|
|
|
2902
|
|
|
|
|
|
|
void set_immediate(bool immediate); |
2903
|
|
|
|
|
|
|
void set_document_id(const string& document_id); |
2904
|
|
|
|
|
|
|
|
2905
|
|
|
|
|
|
|
bool process(istream& is, ostream& os, string& error) const; |
2906
|
|
|
|
|
|
|
|
2907
|
|
|
|
|
|
|
static const string DEFAULT; |
2908
|
|
|
|
|
|
|
static const string NONE; |
2909
|
|
|
|
|
|
|
|
2910
|
|
|
|
|
|
|
private: |
2911
|
|
|
|
|
|
|
const model* m; |
2912
|
|
|
|
|
|
|
string input, tokenizer, tagger, parser, output; |
2913
|
|
|
|
|
|
|
string document_id; |
2914
|
|
|
|
|
|
|
bool immediate; |
2915
|
|
|
|
|
|
|
}; |
2916
|
|
|
|
|
|
|
|
2917
|
|
|
|
|
|
|
///////// |
2918
|
|
|
|
|
|
|
// File: sentence/output_format.h |
2919
|
|
|
|
|
|
|
///////// |
2920
|
|
|
|
|
|
|
|
2921
|
|
|
|
|
|
|
// This file is part of UDPipe . |
2922
|
|
|
|
|
|
|
// |
2923
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2924
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2925
|
|
|
|
|
|
|
// |
2926
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2927
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2928
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2929
|
|
|
|
|
|
|
|
2930
|
1
|
|
|
|
|
|
class output_format { |
2931
|
|
|
|
|
|
|
public: |
2932
|
1
|
|
|
|
|
|
virtual ~output_format() {} |
2933
|
|
|
|
|
|
|
|
2934
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) = 0; |
2935
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) {} |
2936
|
|
|
|
|
|
|
|
2937
|
|
|
|
|
|
|
// Static factory methods |
2938
|
|
|
|
|
|
|
static output_format* new_output_format(const string& name); |
2939
|
|
|
|
|
|
|
static output_format* new_conllu_output_format(const string& options = string()); |
2940
|
|
|
|
|
|
|
static output_format* new_epe_output_format(const string& options = string()); |
2941
|
|
|
|
|
|
|
static output_format* new_matxin_output_format(const string& options = string()); |
2942
|
|
|
|
|
|
|
static output_format* new_horizontal_output_format(const string& options = string()); |
2943
|
|
|
|
|
|
|
static output_format* new_plaintext_output_format(const string& options = string()); |
2944
|
|
|
|
|
|
|
static output_format* new_vertical_output_format(const string& options = string()); |
2945
|
|
|
|
|
|
|
|
2946
|
|
|
|
|
|
|
static const string CONLLU_V1; |
2947
|
|
|
|
|
|
|
static const string CONLLU_V2; |
2948
|
|
|
|
|
|
|
static const string HORIZONTAL_PARAGRAPHS; |
2949
|
|
|
|
|
|
|
static const string PLAINTEXT_NORMALIZED_SPACES; |
2950
|
|
|
|
|
|
|
static const string VERTICAL_PARAGRAPHS; |
2951
|
|
|
|
|
|
|
}; |
2952
|
|
|
|
|
|
|
|
2953
|
|
|
|
|
|
|
///////// |
2954
|
|
|
|
|
|
|
// File: utils/getwhole.h |
2955
|
|
|
|
|
|
|
///////// |
2956
|
|
|
|
|
|
|
|
2957
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
2958
|
|
|
|
|
|
|
// |
2959
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2960
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2961
|
|
|
|
|
|
|
// |
2962
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
2963
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
2964
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
2965
|
|
|
|
|
|
|
|
2966
|
|
|
|
|
|
|
namespace utils { |
2967
|
|
|
|
|
|
|
|
2968
|
|
|
|
|
|
|
// |
2969
|
|
|
|
|
|
|
// Declarations |
2970
|
|
|
|
|
|
|
// |
2971
|
|
|
|
|
|
|
|
2972
|
|
|
|
|
|
|
// Read whole content until EOF. All encountered \n are stored. |
2973
|
|
|
|
|
|
|
inline istream& getwhole(istream& is, string& whole); |
2974
|
|
|
|
|
|
|
|
2975
|
|
|
|
|
|
|
// |
2976
|
|
|
|
|
|
|
// Definitions |
2977
|
|
|
|
|
|
|
// |
2978
|
|
|
|
|
|
|
|
2979
|
0
|
|
|
|
|
|
istream& getwhole(istream& is, string& whole) { |
2980
|
|
|
|
|
|
|
whole.clear(); |
2981
|
|
|
|
|
|
|
|
2982
|
0
|
0
|
|
|
|
|
for (string line; getline(is, line); ) |
|
|
0
|
|
|
|
|
|
2983
|
0
|
0
|
|
|
|
|
whole.append(line).push_back('\n'); |
2984
|
|
|
|
|
|
|
|
2985
|
0
|
0
|
|
|
|
|
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
2986
|
0
|
|
|
|
|
|
return is; |
2987
|
|
|
|
|
|
|
} |
2988
|
|
|
|
|
|
|
|
2989
|
|
|
|
|
|
|
} // namespace utils |
2990
|
|
|
|
|
|
|
|
2991
|
|
|
|
|
|
|
///////// |
2992
|
|
|
|
|
|
|
// File: model/pipeline.cpp |
2993
|
|
|
|
|
|
|
///////// |
2994
|
|
|
|
|
|
|
|
2995
|
|
|
|
|
|
|
// This file is part of UDPipe . |
2996
|
|
|
|
|
|
|
// |
2997
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
2998
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
2999
|
|
|
|
|
|
|
// |
3000
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3001
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3002
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3003
|
|
|
|
|
|
|
|
3004
|
2
|
|
|
|
|
|
const string pipeline::DEFAULT; |
3005
|
2
|
|
|
|
|
|
const string pipeline::NONE = "none"; |
3006
|
|
|
|
|
|
|
|
3007
|
0
|
|
|
|
|
|
pipeline::pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output) : immediate(false) { |
3008
|
|
|
|
|
|
|
set_model(m); |
3009
|
0
|
0
|
|
|
|
|
set_input(input); |
3010
|
|
|
|
|
|
|
set_tagger(tagger); |
3011
|
|
|
|
|
|
|
set_parser(parser); |
3012
|
0
|
0
|
|
|
|
|
set_output(output); |
3013
|
0
|
|
|
|
|
|
} |
3014
|
|
|
|
|
|
|
|
3015
|
0
|
|
|
|
|
|
void pipeline::set_model(const model* m) { |
3016
|
0
|
|
|
|
|
|
this->m = m; |
3017
|
0
|
|
|
|
|
|
} |
3018
|
|
|
|
|
|
|
|
3019
|
0
|
|
|
|
|
|
void pipeline::set_input(const string& input) { |
3020
|
|
|
|
|
|
|
tokenizer.clear(); |
3021
|
|
|
|
|
|
|
|
3022
|
0
|
0
|
|
|
|
|
if (input.empty()) { |
3023
|
0
|
|
|
|
|
|
this->input = "conllu"; |
3024
|
0
|
0
|
|
|
|
|
} else if (input == "tokenize" || input == "tokenizer") { |
3025
|
0
|
|
|
|
|
|
this->input = "tokenizer"; |
3026
|
0
|
0
|
|
|
|
|
} else if (input.compare(0, 10, "tokenizer=") == 0) { |
3027
|
0
|
|
|
|
|
|
this->input = "tokenizer"; |
3028
|
0
|
|
|
|
|
|
tokenizer.assign(input, 10, string::npos); |
3029
|
|
|
|
|
|
|
} else { |
3030
|
0
|
|
|
|
|
|
this->input = input; |
3031
|
|
|
|
|
|
|
} |
3032
|
0
|
|
|
|
|
|
} |
3033
|
|
|
|
|
|
|
|
3034
|
0
|
|
|
|
|
|
void pipeline::set_tagger(const string& tagger) { |
3035
|
0
|
|
|
|
|
|
this->tagger = tagger; |
3036
|
0
|
|
|
|
|
|
} |
3037
|
|
|
|
|
|
|
|
3038
|
0
|
|
|
|
|
|
void pipeline::set_parser(const string& parser) { |
3039
|
0
|
|
|
|
|
|
this->parser = parser; |
3040
|
0
|
|
|
|
|
|
} |
3041
|
|
|
|
|
|
|
|
3042
|
0
|
|
|
|
|
|
void pipeline::set_output(const string& output) { |
3043
|
0
|
0
|
|
|
|
|
this->output = output.empty() ? "conllu" : output; |
3044
|
0
|
|
|
|
|
|
} |
3045
|
|
|
|
|
|
|
|
3046
|
0
|
|
|
|
|
|
void pipeline::set_immediate(bool immediate) { |
3047
|
0
|
|
|
|
|
|
this->immediate = immediate; |
3048
|
0
|
|
|
|
|
|
} |
3049
|
|
|
|
|
|
|
|
3050
|
0
|
|
|
|
|
|
void pipeline::set_document_id(const string& document_id) { |
3051
|
0
|
|
|
|
|
|
this->document_id = document_id; |
3052
|
0
|
|
|
|
|
|
} |
3053
|
|
|
|
|
|
|
|
3054
|
0
|
|
|
|
|
|
bool pipeline::process(istream& is, ostream& os, string& error) const { |
3055
|
|
|
|
|
|
|
error.clear(); |
3056
|
|
|
|
|
|
|
|
3057
|
0
|
|
|
|
|
|
sentence s; |
3058
|
|
|
|
|
|
|
|
3059
|
|
|
|
|
|
|
unique_ptr reader; |
3060
|
0
|
0
|
|
|
|
|
if (input == "tokenizer") { |
3061
|
0
|
0
|
|
|
|
|
reader.reset(m->new_tokenizer(tokenizer)); |
3062
|
0
|
0
|
|
|
|
|
if (!reader) return error.assign("The model does not have a tokenizer!"), false; |
|
|
0
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
} else { |
3064
|
0
|
0
|
|
|
|
|
reader.reset(input_format::new_input_format(input)); |
3065
|
0
|
0
|
|
|
|
|
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3066
|
|
|
|
|
|
|
} |
3067
|
0
|
0
|
|
|
|
|
reader->reset_document(document_id); |
3068
|
|
|
|
|
|
|
|
3069
|
0
|
0
|
|
|
|
|
unique_ptr writer(output_format::new_output_format(output)); |
3070
|
0
|
0
|
|
|
|
|
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3071
|
|
|
|
|
|
|
|
3072
|
|
|
|
|
|
|
string block; |
3073
|
0
|
0
|
|
|
|
|
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3074
|
0
|
0
|
|
|
|
|
reader->set_text(block); |
3075
|
0
|
0
|
|
|
|
|
while (reader->next_sentence(s, error)) { |
|
|
0
|
|
|
|
|
|
3076
|
0
|
0
|
|
|
|
|
if (tagger != NONE) |
3077
|
0
|
0
|
|
|
|
|
if (!m->tag(s, tagger, error)) |
|
|
0
|
|
|
|
|
|
3078
|
|
|
|
|
|
|
return false; |
3079
|
|
|
|
|
|
|
|
3080
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
3081
|
0
|
0
|
|
|
|
|
if (!m->parse(s, parser, error)) |
|
|
0
|
|
|
|
|
|
3082
|
|
|
|
|
|
|
return false; |
3083
|
|
|
|
|
|
|
|
3084
|
0
|
0
|
|
|
|
|
writer->write_sentence(s, os); |
3085
|
|
|
|
|
|
|
} |
3086
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
3087
|
|
|
|
|
|
|
} |
3088
|
0
|
0
|
|
|
|
|
writer->finish_document(os); |
3089
|
|
|
|
|
|
|
|
3090
|
|
|
|
|
|
|
return true; |
3091
|
|
|
|
|
|
|
} |
3092
|
|
|
|
|
|
|
|
3093
|
|
|
|
|
|
|
///////// |
3094
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/tagset_converter.h |
3095
|
|
|
|
|
|
|
///////// |
3096
|
|
|
|
|
|
|
|
3097
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
3098
|
|
|
|
|
|
|
// |
3099
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
3100
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3101
|
|
|
|
|
|
|
// |
3102
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3103
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3104
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3105
|
|
|
|
|
|
|
|
3106
|
|
|
|
|
|
|
namespace morphodita { |
3107
|
|
|
|
|
|
|
|
3108
|
0
|
|
|
|
|
|
class tagset_converter { |
3109
|
|
|
|
|
|
|
public: |
3110
|
0
|
|
|
|
|
|
virtual ~tagset_converter() {} |
3111
|
|
|
|
|
|
|
|
3112
|
|
|
|
|
|
|
// Convert a tag-lemma pair to a different tag set. |
3113
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const = 0; |
3114
|
|
|
|
|
|
|
// Convert a result of analysis to a different tag set. Apart from calling |
3115
|
|
|
|
|
|
|
// convert, any repeated entry is removed. |
3116
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const = 0; |
3117
|
|
|
|
|
|
|
// Convert a result of generation to a different tag set. Apart from calling |
3118
|
|
|
|
|
|
|
// convert, any repeated entry is removed. |
3119
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const = 0; |
3120
|
|
|
|
|
|
|
|
3121
|
|
|
|
|
|
|
// Static factory methods |
3122
|
|
|
|
|
|
|
static tagset_converter* new_identity_converter(); |
3123
|
|
|
|
|
|
|
|
3124
|
|
|
|
|
|
|
static tagset_converter* new_pdt_to_conll2009_converter(); |
3125
|
|
|
|
|
|
|
static tagset_converter* new_strip_lemma_comment_converter(const morpho& dictionary); |
3126
|
|
|
|
|
|
|
static tagset_converter* new_strip_lemma_id_converter(const morpho& dictionary); |
3127
|
|
|
|
|
|
|
}; |
3128
|
|
|
|
|
|
|
|
3129
|
|
|
|
|
|
|
// Helper method for creating tagset_converter from instance name. |
3130
|
|
|
|
|
|
|
tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary); |
3131
|
|
|
|
|
|
|
|
3132
|
|
|
|
|
|
|
// Helper methods making sure remapped results are unique. |
3133
|
|
|
|
|
|
|
void tagset_converter_unique_analyzed(vector& tagged_lemmas); |
3134
|
|
|
|
|
|
|
void tagset_converter_unique_generated(vector& forms); |
3135
|
|
|
|
|
|
|
|
3136
|
|
|
|
|
|
|
} // namespace morphodita |
3137
|
|
|
|
|
|
|
|
3138
|
|
|
|
|
|
|
///////// |
3139
|
|
|
|
|
|
|
// File: morphodita/derivator/derivation_formatter.h |
3140
|
|
|
|
|
|
|
///////// |
3141
|
|
|
|
|
|
|
|
3142
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
3143
|
|
|
|
|
|
|
// |
3144
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
3145
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3146
|
|
|
|
|
|
|
// |
3147
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3148
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3149
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3150
|
|
|
|
|
|
|
|
3151
|
|
|
|
|
|
|
namespace morphodita { |
3152
|
|
|
|
|
|
|
|
3153
|
0
|
|
|
|
|
|
class derivation_formatter { |
3154
|
|
|
|
|
|
|
public: |
3155
|
0
|
|
|
|
|
|
virtual ~derivation_formatter() {} |
3156
|
|
|
|
|
|
|
|
3157
|
|
|
|
|
|
|
// Perform the required derivation and store it directly in the lemma. |
3158
|
|
|
|
|
|
|
virtual void format_derivation(string& lemma) const; |
3159
|
|
|
|
|
|
|
|
3160
|
|
|
|
|
|
|
// Perform the required derivation and store it directly in the tagged_lemma. |
3161
|
|
|
|
|
|
|
// If a tagset_converter is given, it is also applied. |
3162
|
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter = nullptr) const = 0; |
3163
|
|
|
|
|
|
|
|
3164
|
|
|
|
|
|
|
// Perform the required derivation on a list of tagged_lemmas. |
3165
|
|
|
|
|
|
|
// If a tagset_converter is given, it is also applied. |
3166
|
|
|
|
|
|
|
// Either way, only unique entries are returned. |
3167
|
|
|
|
|
|
|
virtual void format_tagged_lemmas(vector& lemmas, const tagset_converter* converter = nullptr) const; |
3168
|
|
|
|
|
|
|
|
3169
|
|
|
|
|
|
|
// Static factory methods. |
3170
|
|
|
|
|
|
|
static derivation_formatter* new_none_derivation_formatter(); |
3171
|
|
|
|
|
|
|
static derivation_formatter* new_root_derivation_formatter(const derivator* derinet); |
3172
|
|
|
|
|
|
|
static derivation_formatter* new_path_derivation_formatter(const derivator* derinet); |
3173
|
|
|
|
|
|
|
static derivation_formatter* new_tree_derivation_formatter(const derivator* derinet); |
3174
|
|
|
|
|
|
|
// String version of static factory method. |
3175
|
|
|
|
|
|
|
static derivation_formatter* new_derivation_formatter(string_piece name, const derivator* derinet); |
3176
|
|
|
|
|
|
|
}; |
3177
|
|
|
|
|
|
|
|
3178
|
|
|
|
|
|
|
} // namespace morphodita |
3179
|
|
|
|
|
|
|
|
3180
|
|
|
|
|
|
|
///////// |
3181
|
|
|
|
|
|
|
// File: morphodita/derivator/derivation_formatter.cpp |
3182
|
|
|
|
|
|
|
///////// |
3183
|
|
|
|
|
|
|
|
3184
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
3185
|
|
|
|
|
|
|
// |
3186
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
3187
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3188
|
|
|
|
|
|
|
// |
3189
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3190
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3191
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3192
|
|
|
|
|
|
|
|
3193
|
|
|
|
|
|
|
namespace morphodita { |
3194
|
|
|
|
|
|
|
|
3195
|
0
|
|
|
|
|
|
void derivation_formatter::format_derivation(string& lemma) const { |
3196
|
0
|
|
|
|
|
|
tagged_lemma result; |
3197
|
0
|
|
|
|
|
|
result.lemma.swap(lemma); |
3198
|
0
|
0
|
|
|
|
|
format_tagged_lemma(result); |
3199
|
0
|
|
|
|
|
|
lemma.swap(result.lemma); |
3200
|
0
|
|
|
|
|
|
} |
3201
|
|
|
|
|
|
|
|
3202
|
0
|
|
|
|
|
|
void derivation_formatter::format_tagged_lemmas(vector& lemmas, const tagset_converter* converter) const { |
3203
|
0
|
0
|
|
|
|
|
for (auto&& lemma : lemmas) |
3204
|
0
|
|
|
|
|
|
format_tagged_lemma(lemma, converter); |
3205
|
|
|
|
|
|
|
|
3206
|
0
|
0
|
|
|
|
|
if (lemmas.size() > 1) |
3207
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(lemmas); |
3208
|
0
|
|
|
|
|
|
} |
3209
|
|
|
|
|
|
|
|
3210
|
0
|
|
|
|
|
|
class none_derivation_formatter : public derivation_formatter { |
3211
|
0
|
|
|
|
|
|
virtual void format_derivation(string& /*lemma*/) const override {} |
3212
|
|
|
|
|
|
|
|
3213
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
3214
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
3215
|
0
|
|
|
|
|
|
} |
3216
|
|
|
|
|
|
|
|
3217
|
0
|
|
|
|
|
|
virtual void format_tagged_lemmas(vector& lemmas, const tagset_converter* converter) const override { |
3218
|
0
|
0
|
|
|
|
|
if (converter) converter->convert_analyzed(lemmas); |
3219
|
0
|
|
|
|
|
|
} |
3220
|
|
|
|
|
|
|
}; |
3221
|
|
|
|
|
|
|
|
3222
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_none_derivation_formatter() { |
3223
|
0
|
|
|
|
|
|
return new none_derivation_formatter(); |
3224
|
|
|
|
|
|
|
} |
3225
|
|
|
|
|
|
|
|
3226
|
0
|
|
|
|
|
|
class root_derivation_formatter : public derivation_formatter { |
3227
|
|
|
|
|
|
|
public: |
3228
|
0
|
|
|
|
|
|
root_derivation_formatter(const derivator* derinet) : derinet(derinet) {} |
3229
|
|
|
|
|
|
|
|
3230
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
3231
|
0
|
0
|
|
|
|
|
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
|
|
0
|
|
|
|
|
|
3232
|
0
|
|
|
|
|
|
lemma.lemma.assign(parent.lemma); |
3233
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
3234
|
0
|
|
|
|
|
|
} |
3235
|
|
|
|
|
|
|
|
3236
|
|
|
|
|
|
|
private: |
3237
|
|
|
|
|
|
|
const derivator* derinet; |
3238
|
|
|
|
|
|
|
}; |
3239
|
|
|
|
|
|
|
|
3240
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_root_derivation_formatter(const derivator* derinet) { |
3241
|
0
|
0
|
|
|
|
|
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
|
|
0
|
|
|
|
|
|
3242
|
|
|
|
|
|
|
} |
3243
|
|
|
|
|
|
|
|
3244
|
0
|
|
|
|
|
|
class path_derivation_formatter : public derivation_formatter { |
3245
|
|
|
|
|
|
|
public: |
3246
|
0
|
|
|
|
|
|
path_derivation_formatter(const derivator* derinet) : derinet(derinet) {} |
3247
|
|
|
|
|
|
|
|
3248
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
3249
|
0
|
|
|
|
|
|
tagged_lemma current(lemma); |
3250
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
|
|
0
|
|
|
|
|
|
3251
|
0
|
0
|
|
|
|
|
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
|
|
0
|
|
|
|
|
|
3252
|
0
|
0
|
|
|
|
|
tagged_lemma parrent_lemma(parent.lemma, current.tag); |
3253
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(parrent_lemma); |
|
|
0
|
|
|
|
|
|
3254
|
0
|
0
|
|
|
|
|
lemma.lemma.append(" ").append(parrent_lemma.lemma); |
3255
|
|
|
|
|
|
|
} |
3256
|
0
|
|
|
|
|
|
} |
3257
|
|
|
|
|
|
|
|
3258
|
|
|
|
|
|
|
private: |
3259
|
|
|
|
|
|
|
const derivator* derinet; |
3260
|
|
|
|
|
|
|
}; |
3261
|
|
|
|
|
|
|
|
3262
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_path_derivation_formatter(const derivator* derinet) { |
3263
|
0
|
0
|
|
|
|
|
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
|
|
0
|
|
|
|
|
|
3264
|
|
|
|
|
|
|
} |
3265
|
|
|
|
|
|
|
|
3266
|
0
|
|
|
|
|
|
class tree_derivation_formatter : public derivation_formatter { |
3267
|
|
|
|
|
|
|
public: |
3268
|
0
|
|
|
|
|
|
tree_derivation_formatter(const derivator* derinet) : derinet(derinet) {} |
3269
|
|
|
|
|
|
|
|
3270
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
3271
|
|
|
|
|
|
|
string root(lemma.lemma), tag(lemma.tag); |
3272
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
|
|
0
|
|
|
|
|
|
3273
|
0
|
0
|
|
|
|
|
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
|
|
0
|
|
|
|
|
|
3274
|
0
|
0
|
|
|
|
|
format_tree(root, tag, lemma, converter); |
3275
|
0
|
|
|
|
|
|
} |
3276
|
|
|
|
|
|
|
|
3277
|
0
|
|
|
|
|
|
void format_tree(const string& root, const string& tag, tagged_lemma& tree, const tagset_converter* converter) const { |
3278
|
0
|
|
|
|
|
|
vector children; |
3279
|
|
|
|
|
|
|
|
3280
|
0
|
0
|
|
|
|
|
if (converter) { |
3281
|
0
|
0
|
|
|
|
|
tagged_lemma current(root, tag); |
3282
|
0
|
0
|
|
|
|
|
converter->convert(current); |
3283
|
0
|
0
|
|
|
|
|
tree.lemma.append(" ").append(current.lemma); |
3284
|
|
|
|
|
|
|
} else { |
3285
|
0
|
0
|
|
|
|
|
tree.lemma.append(" ").append(root); |
3286
|
|
|
|
|
|
|
} |
3287
|
|
|
|
|
|
|
|
3288
|
0
|
0
|
|
|
|
|
if (derinet->children(root, children)) |
|
|
0
|
|
|
|
|
|
3289
|
0
|
0
|
|
|
|
|
for (auto&& child : children) |
3290
|
0
|
0
|
|
|
|
|
format_tree(child.lemma, tag, tree, converter); |
3291
|
0
|
0
|
|
|
|
|
tree.lemma.push_back(' '); |
3292
|
0
|
|
|
|
|
|
} |
3293
|
|
|
|
|
|
|
|
3294
|
|
|
|
|
|
|
private: |
3295
|
|
|
|
|
|
|
const derivator* derinet; |
3296
|
|
|
|
|
|
|
}; |
3297
|
|
|
|
|
|
|
|
3298
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_tree_derivation_formatter(const derivator* derinet) { |
3299
|
0
|
0
|
|
|
|
|
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
|
|
0
|
|
|
|
|
|
3300
|
|
|
|
|
|
|
} |
3301
|
|
|
|
|
|
|
|
3302
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_derivation_formatter(string_piece name, const derivator* derinet) { |
3303
|
0
|
0
|
|
|
|
|
if (name == "none") return new_none_derivation_formatter(); |
3304
|
0
|
0
|
|
|
|
|
if (name == "root") return new_root_derivation_formatter(derinet); |
3305
|
0
|
0
|
|
|
|
|
if (name == "path") return new_path_derivation_formatter(derinet); |
3306
|
0
|
0
|
|
|
|
|
if (name == "tree") return new_tree_derivation_formatter(derinet); |
3307
|
|
|
|
|
|
|
return nullptr; |
3308
|
|
|
|
|
|
|
} |
3309
|
|
|
|
|
|
|
|
3310
|
|
|
|
|
|
|
} // namespace morphodita |
3311
|
|
|
|
|
|
|
|
3312
|
|
|
|
|
|
|
///////// |
3313
|
|
|
|
|
|
|
// File: morphodita/morpho/small_stringops.h |
3314
|
|
|
|
|
|
|
///////// |
3315
|
|
|
|
|
|
|
|
3316
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
3317
|
|
|
|
|
|
|
// |
3318
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
3319
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3320
|
|
|
|
|
|
|
// |
3321
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3322
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3323
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3324
|
|
|
|
|
|
|
|
3325
|
|
|
|
|
|
|
namespace morphodita { |
3326
|
|
|
|
|
|
|
|
3327
|
|
|
|
|
|
|
// Declarations |
3328
|
|
|
|
|
|
|
inline bool small_memeq(const void* a, const void* b, size_t len); |
3329
|
|
|
|
|
|
|
inline void small_memcpy(void* dest, const void* src, size_t len); |
3330
|
|
|
|
|
|
|
|
3331
|
|
|
|
|
|
|
// Definitions |
3332
|
|
|
|
|
|
|
bool small_memeq(const void* a_void, const void* b_void, size_t len) { |
3333
|
|
|
|
|
|
|
const char* a = (const char*)a_void; |
3334
|
|
|
|
|
|
|
const char* b = (const char*)b_void; |
3335
|
|
|
|
|
|
|
|
3336
|
1980
|
0
|
|
|
|
|
while (len--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3337
|
1735
|
0
|
|
|
|
|
if (*a++ != *b++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3338
|
|
|
|
|
|
|
return false; |
3339
|
|
|
|
|
|
|
return true; |
3340
|
|
|
|
|
|
|
} |
3341
|
|
|
|
|
|
|
|
3342
|
|
|
|
|
|
|
void small_memcpy(void* dest_void, const void* src_void, size_t len) { |
3343
|
|
|
|
|
|
|
char* dest = (char*)dest_void; |
3344
|
|
|
|
|
|
|
const char* src = (const char*)src_void; |
3345
|
|
|
|
|
|
|
|
3346
|
1353
|
0
|
|
|
|
|
while (len--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
3347
|
967
|
|
|
|
|
|
*dest++ = *src++; |
3348
|
|
|
|
|
|
|
} |
3349
|
|
|
|
|
|
|
|
3350
|
|
|
|
|
|
|
} // namespace morphodita |
3351
|
|
|
|
|
|
|
|
3352
|
|
|
|
|
|
|
///////// |
3353
|
|
|
|
|
|
|
// File: trainer/training_failure.h |
3354
|
|
|
|
|
|
|
///////// |
3355
|
|
|
|
|
|
|
|
3356
|
|
|
|
|
|
|
// This file is part of UDPipe . |
3357
|
|
|
|
|
|
|
// |
3358
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
3359
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3360
|
|
|
|
|
|
|
// |
3361
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3362
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3363
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3364
|
|
|
|
|
|
|
|
3365
|
|
|
|
|
|
|
namespace utils { |
3366
|
|
|
|
|
|
|
|
3367
|
0
|
|
|
|
|
|
class training_error : public runtime_error { |
3368
|
|
|
|
|
|
|
public: |
3369
|
|
|
|
|
|
|
training_error(); |
3370
|
|
|
|
|
|
|
|
3371
|
|
|
|
|
|
|
static ostringstream message_collector; |
3372
|
|
|
|
|
|
|
}; |
3373
|
|
|
|
|
|
|
|
3374
|
|
|
|
|
|
|
#define training_failure(message) throw (training_error::message_collector << message, training_error()) |
3375
|
|
|
|
|
|
|
|
3376
|
|
|
|
|
|
|
} // namespace utils |
3377
|
|
|
|
|
|
|
|
3378
|
|
|
|
|
|
|
///////// |
3379
|
|
|
|
|
|
|
// File: utils/binary_encoder.h |
3380
|
|
|
|
|
|
|
///////// |
3381
|
|
|
|
|
|
|
|
3382
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
3383
|
|
|
|
|
|
|
// |
3384
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
3385
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3386
|
|
|
|
|
|
|
// |
3387
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3388
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3389
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3390
|
|
|
|
|
|
|
|
3391
|
|
|
|
|
|
|
namespace utils { |
3392
|
|
|
|
|
|
|
|
3393
|
|
|
|
|
|
|
// |
3394
|
|
|
|
|
|
|
// Declarations |
3395
|
|
|
|
|
|
|
// |
3396
|
|
|
|
|
|
|
|
3397
|
0
|
|
|
|
|
|
class binary_encoder { |
3398
|
|
|
|
|
|
|
public: |
3399
|
|
|
|
|
|
|
inline binary_encoder(); |
3400
|
|
|
|
|
|
|
|
3401
|
|
|
|
|
|
|
inline void add_1B(unsigned val); |
3402
|
|
|
|
|
|
|
inline void add_2B(unsigned val); |
3403
|
|
|
|
|
|
|
inline void add_4B(unsigned val); |
3404
|
|
|
|
|
|
|
inline void add_float(double val); |
3405
|
|
|
|
|
|
|
inline void add_double(double val); |
3406
|
|
|
|
|
|
|
inline void add_str(string_piece str); |
3407
|
|
|
|
|
|
|
inline void add_data(string_piece data); |
3408
|
|
|
|
|
|
|
template inline void add_data(const vector& data); |
3409
|
|
|
|
|
|
|
template inline void add_data(const T* data, size_t elements); |
3410
|
|
|
|
|
|
|
|
3411
|
|
|
|
|
|
|
vector data; |
3412
|
|
|
|
|
|
|
}; |
3413
|
|
|
|
|
|
|
|
3414
|
|
|
|
|
|
|
// |
3415
|
|
|
|
|
|
|
// Definitions |
3416
|
|
|
|
|
|
|
// |
3417
|
|
|
|
|
|
|
|
3418
|
0
|
|
|
|
|
|
binary_encoder::binary_encoder() { |
3419
|
0
|
0
|
|
|
|
|
data.reserve(16); |
3420
|
0
|
|
|
|
|
|
} |
3421
|
|
|
|
|
|
|
|
3422
|
0
|
|
|
|
|
|
void binary_encoder::add_1B(unsigned val) { |
3423
|
0
|
0
|
|
|
|
|
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3424
|
0
|
|
|
|
|
|
data.push_back(val); |
3425
|
0
|
|
|
|
|
|
} |
3426
|
|
|
|
|
|
|
|
3427
|
0
|
|
|
|
|
|
void binary_encoder::add_2B(unsigned val) { |
3428
|
0
|
0
|
|
|
|
|
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3429
|
0
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(uint16_t)); |
3430
|
0
|
|
|
|
|
|
} |
3431
|
|
|
|
|
|
|
|
3432
|
|
|
|
|
|
|
void binary_encoder::add_4B(unsigned val) { |
3433
|
|
|
|
|
|
|
if (uint32_t(val) != val) training_failure("Should encode value " << val << " in four bytes!"); |
3434
|
0
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(uint32_t)); |
3435
|
|
|
|
|
|
|
} |
3436
|
|
|
|
|
|
|
|
3437
|
|
|
|
|
|
|
void binary_encoder::add_float(double val) { |
3438
|
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(float)); |
3439
|
|
|
|
|
|
|
} |
3440
|
|
|
|
|
|
|
|
3441
|
|
|
|
|
|
|
void binary_encoder::add_double(double val) { |
3442
|
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(double)); |
3443
|
|
|
|
|
|
|
} |
3444
|
|
|
|
|
|
|
|
3445
|
0
|
|
|
|
|
|
void binary_encoder::add_str(string_piece str) { |
3446
|
0
|
|
|
|
|
|
add_1B(str.len < 255 ? str.len : 255); |
3447
|
0
|
0
|
|
|
|
|
if (!(str.len < 255)) add_4B(str.len); |
3448
|
|
|
|
|
|
|
add_data(str); |
3449
|
0
|
|
|
|
|
|
} |
3450
|
|
|
|
|
|
|
|
3451
|
|
|
|
|
|
|
void binary_encoder::add_data(string_piece data) { |
3452
|
0
|
|
|
|
|
|
this->data.insert(this->data.end(), (const unsigned char*) data.str, (const unsigned char*) (data.str + data.len)); |
3453
|
|
|
|
|
|
|
} |
3454
|
|
|
|
|
|
|
|
3455
|
|
|
|
|
|
|
template |
3456
|
|
|
|
|
|
|
void binary_encoder::add_data(const vector& data) { |
3457
|
0
|
|
|
|
|
|
this->data.insert(this->data.end(), (const unsigned char*) data.data(), (const unsigned char*) (data.data() + data.size())); |
3458
|
|
|
|
|
|
|
} |
3459
|
|
|
|
|
|
|
|
3460
|
|
|
|
|
|
|
template |
3461
|
|
|
|
|
|
|
void binary_encoder::add_data(const T* data, size_t elements) { |
3462
|
0
|
|
|
|
|
|
this->data.insert(this->data.end(), (const unsigned char*) data, (const unsigned char*) (data + elements)); |
3463
|
|
|
|
|
|
|
} |
3464
|
|
|
|
|
|
|
|
3465
|
|
|
|
|
|
|
} // namespace utils |
3466
|
|
|
|
|
|
|
|
3467
|
|
|
|
|
|
|
///////// |
3468
|
|
|
|
|
|
|
// File: utils/pointer_decoder.h |
3469
|
|
|
|
|
|
|
///////// |
3470
|
|
|
|
|
|
|
|
3471
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
3472
|
|
|
|
|
|
|
// |
3473
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
3474
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3475
|
|
|
|
|
|
|
// |
3476
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3477
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3478
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3479
|
|
|
|
|
|
|
|
3480
|
|
|
|
|
|
|
namespace utils { |
3481
|
|
|
|
|
|
|
|
3482
|
|
|
|
|
|
|
// |
3483
|
|
|
|
|
|
|
// Declarations |
3484
|
|
|
|
|
|
|
// |
3485
|
|
|
|
|
|
|
|
3486
|
|
|
|
|
|
|
class pointer_decoder { |
3487
|
|
|
|
|
|
|
public: |
3488
|
|
|
|
|
|
|
inline pointer_decoder(const unsigned char*& data); |
3489
|
|
|
|
|
|
|
inline unsigned next_1B(); |
3490
|
|
|
|
|
|
|
inline unsigned next_2B(); |
3491
|
|
|
|
|
|
|
inline unsigned next_4B(); |
3492
|
|
|
|
|
|
|
inline void next_str(string& str); |
3493
|
|
|
|
|
|
|
template inline const T* next(unsigned elements); |
3494
|
|
|
|
|
|
|
|
3495
|
|
|
|
|
|
|
private: |
3496
|
|
|
|
|
|
|
const unsigned char*& data; |
3497
|
|
|
|
|
|
|
}; |
3498
|
|
|
|
|
|
|
|
3499
|
|
|
|
|
|
|
// |
3500
|
|
|
|
|
|
|
// Definitions |
3501
|
|
|
|
|
|
|
// |
3502
|
|
|
|
|
|
|
|
3503
|
14
|
|
|
|
|
|
pointer_decoder::pointer_decoder(const unsigned char*& data) : data(data) {} |
3504
|
|
|
|
|
|
|
|
3505
|
|
|
|
|
|
|
unsigned pointer_decoder::next_1B() { |
3506
|
0
|
|
|
|
|
|
return *data++; |
3507
|
|
|
|
|
|
|
} |
3508
|
|
|
|
|
|
|
|
3509
|
|
|
|
|
|
|
unsigned pointer_decoder::next_2B() { |
3510
|
|
|
|
|
|
|
uint16_t result; |
3511
|
14
|
|
|
|
|
|
memcpy(&result, data, sizeof(uint16_t)); |
3512
|
14
|
|
|
|
|
|
data += sizeof(uint16_t); |
3513
|
1
|
|
|
|
|
|
return result; |
3514
|
|
|
|
|
|
|
} |
3515
|
|
|
|
|
|
|
|
3516
|
|
|
|
|
|
|
unsigned pointer_decoder::next_4B() { |
3517
|
|
|
|
|
|
|
uint32_t result; |
3518
|
13
|
|
|
|
|
|
memcpy(&result, data, sizeof(uint32_t)); |
3519
|
13
|
|
|
|
|
|
data += sizeof(uint32_t); |
3520
|
|
|
|
|
|
|
return result; |
3521
|
|
|
|
|
|
|
} |
3522
|
|
|
|
|
|
|
|
3523
|
|
|
|
|
|
|
void pointer_decoder::next_str(string& str) { |
3524
|
|
|
|
|
|
|
unsigned len = next_1B(); |
3525
|
|
|
|
|
|
|
if (len == 255) len = next_4B(); |
3526
|
|
|
|
|
|
|
str.assign(next(len), len); |
3527
|
|
|
|
|
|
|
} |
3528
|
|
|
|
|
|
|
|
3529
|
|
|
|
|
|
|
template const T* pointer_decoder::next(unsigned elements) { |
3530
|
3
|
|
|
|
|
|
const T* result = (const T*) data; |
3531
|
0
|
|
|
|
|
|
data += sizeof(T) * elements; |
3532
|
|
|
|
|
|
|
return result; |
3533
|
|
|
|
|
|
|
} |
3534
|
|
|
|
|
|
|
|
3535
|
|
|
|
|
|
|
} // namespace utils |
3536
|
|
|
|
|
|
|
|
3537
|
|
|
|
|
|
|
///////// |
3538
|
|
|
|
|
|
|
// File: utils/unaligned_access.h |
3539
|
|
|
|
|
|
|
///////// |
3540
|
|
|
|
|
|
|
|
3541
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
3542
|
|
|
|
|
|
|
// |
3543
|
|
|
|
|
|
|
// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of |
3544
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3545
|
|
|
|
|
|
|
// |
3546
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3547
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3548
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3549
|
|
|
|
|
|
|
|
3550
|
|
|
|
|
|
|
namespace utils { |
3551
|
|
|
|
|
|
|
|
3552
|
|
|
|
|
|
|
// |
3553
|
|
|
|
|
|
|
// Declarations |
3554
|
|
|
|
|
|
|
// |
3555
|
|
|
|
|
|
|
|
3556
|
|
|
|
|
|
|
template |
3557
|
|
|
|
|
|
|
inline T unaligned_load(const P* ptr); |
3558
|
|
|
|
|
|
|
|
3559
|
|
|
|
|
|
|
template |
3560
|
|
|
|
|
|
|
inline T unaligned_load_inc(const P*& ptr); |
3561
|
|
|
|
|
|
|
|
3562
|
|
|
|
|
|
|
template |
3563
|
|
|
|
|
|
|
inline void unaligned_store(P* ptr, T value); |
3564
|
|
|
|
|
|
|
|
3565
|
|
|
|
|
|
|
template |
3566
|
|
|
|
|
|
|
inline void unaligned_store_inc(P*& ptr, T value); |
3567
|
|
|
|
|
|
|
|
3568
|
|
|
|
|
|
|
template |
3569
|
|
|
|
|
|
|
T* unaligned_lower_bound(T* first, size_t size, T val); |
3570
|
|
|
|
|
|
|
|
3571
|
|
|
|
|
|
|
template |
3572
|
|
|
|
|
|
|
T* unaligned_upper_bound(T* first, size_t size, T val); |
3573
|
|
|
|
|
|
|
|
3574
|
|
|
|
|
|
|
// |
3575
|
|
|
|
|
|
|
// Definitions |
3576
|
|
|
|
|
|
|
// |
3577
|
|
|
|
|
|
|
|
3578
|
|
|
|
|
|
|
template |
3579
|
|
|
|
|
|
|
inline T unaligned_load(const P* ptr) { |
3580
|
|
|
|
|
|
|
T value; |
3581
|
|
|
|
|
|
|
memcpy(&value, ptr, sizeof(T)); |
3582
|
|
|
|
|
|
|
return value; |
3583
|
|
|
|
|
|
|
} |
3584
|
|
|
|
|
|
|
|
3585
|
|
|
|
|
|
|
template |
3586
|
|
|
|
|
|
|
inline T unaligned_load_inc(const P*& ptr) { |
3587
|
|
|
|
|
|
|
T value; |
3588
|
|
|
|
|
|
|
memcpy(&value, ptr, sizeof(T)); |
3589
|
0
|
|
|
|
|
|
((const char*&)ptr) += sizeof(T); |
3590
|
|
|
|
|
|
|
return value; |
3591
|
|
|
|
|
|
|
} |
3592
|
|
|
|
|
|
|
|
3593
|
|
|
|
|
|
|
template |
3594
|
|
|
|
|
|
|
inline void unaligned_store(P* ptr, T value) { |
3595
|
|
|
|
|
|
|
memcpy(ptr, &value, sizeof(T)); |
3596
|
|
|
|
|
|
|
} |
3597
|
|
|
|
|
|
|
|
3598
|
|
|
|
|
|
|
template |
3599
|
|
|
|
|
|
|
inline void unaligned_store_inc(P*& ptr, T value) { |
3600
|
|
|
|
|
|
|
memcpy(ptr, &value, sizeof(T)); |
3601
|
50
|
|
|
|
|
|
((char*&)ptr) += sizeof(T); |
3602
|
|
|
|
|
|
|
} |
3603
|
|
|
|
|
|
|
|
3604
|
|
|
|
|
|
|
template |
3605
|
|
|
|
|
|
|
T* unaligned_lower_bound(T* first, size_t size, T val) { |
3606
|
40
|
100
|
|
|
|
|
while (size) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3607
|
30
|
|
|
|
|
|
size_t step = size >> 1; |
3608
|
30
|
100
|
|
|
|
|
if (unaligned_load(first + step) < val) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3609
|
9
|
|
|
|
|
|
first += step + 1; |
3610
|
9
|
|
|
|
|
|
size -= step + 1; |
3611
|
|
|
|
|
|
|
} else { |
3612
|
|
|
|
|
|
|
size = step; |
3613
|
|
|
|
|
|
|
} |
3614
|
|
|
|
|
|
|
} |
3615
|
|
|
|
|
|
|
return first; |
3616
|
|
|
|
|
|
|
} |
3617
|
|
|
|
|
|
|
|
3618
|
|
|
|
|
|
|
template |
3619
|
|
|
|
|
|
|
T* unaligned_upper_bound(T* first, size_t size, T val) { |
3620
|
|
|
|
|
|
|
while (size) { |
3621
|
|
|
|
|
|
|
size_t step = size >> 1; |
3622
|
|
|
|
|
|
|
if (!(val < unaligned_load(first + step))) { |
3623
|
|
|
|
|
|
|
first += step + 1; |
3624
|
|
|
|
|
|
|
size -= step + 1; |
3625
|
|
|
|
|
|
|
} else { |
3626
|
|
|
|
|
|
|
size = step; |
3627
|
|
|
|
|
|
|
} |
3628
|
|
|
|
|
|
|
} |
3629
|
|
|
|
|
|
|
return first; |
3630
|
|
|
|
|
|
|
} |
3631
|
|
|
|
|
|
|
|
3632
|
|
|
|
|
|
|
} // namespace utils |
3633
|
|
|
|
|
|
|
|
3634
|
|
|
|
|
|
|
///////// |
3635
|
|
|
|
|
|
|
// File: morphodita/morpho/persistent_unordered_map.h |
3636
|
|
|
|
|
|
|
///////// |
3637
|
|
|
|
|
|
|
|
3638
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
3639
|
|
|
|
|
|
|
// |
3640
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
3641
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3642
|
|
|
|
|
|
|
// |
3643
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3644
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3645
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3646
|
|
|
|
|
|
|
|
3647
|
|
|
|
|
|
|
namespace morphodita { |
3648
|
|
|
|
|
|
|
|
3649
|
|
|
|
|
|
|
// Declarations |
3650
|
103
|
0
|
|
|
|
|
class persistent_unordered_map { |
|
|
0
|
|
|
|
|
|
3651
|
|
|
|
|
|
|
public: |
3652
|
|
|
|
|
|
|
// Accessing function |
3653
|
|
|
|
|
|
|
template |
3654
|
|
|
|
|
|
|
inline const unsigned char* at(const char* str, int len, EntrySize entry_size) const; |
3655
|
|
|
|
|
|
|
|
3656
|
|
|
|
|
|
|
template |
3657
|
|
|
|
|
|
|
inline const T* at_typed(const char* str, int len) const; |
3658
|
|
|
|
|
|
|
|
3659
|
|
|
|
|
|
|
template |
3660
|
|
|
|
|
|
|
inline void iter(const char* str, int len, EntryProcess entry_process) const; |
3661
|
|
|
|
|
|
|
|
3662
|
|
|
|
|
|
|
template |
3663
|
|
|
|
|
|
|
inline void iter_all(EntryProcess entry_process) const; |
3664
|
|
|
|
|
|
|
|
3665
|
|
|
|
|
|
|
// Two helper functions accessing some internals |
3666
|
|
|
|
|
|
|
inline int max_length() const; |
3667
|
|
|
|
|
|
|
inline const unsigned char* data_start(int len) const; |
3668
|
|
|
|
|
|
|
|
3669
|
|
|
|
|
|
|
// Creation functions |
3670
|
|
|
|
|
|
|
persistent_unordered_map() {} |
3671
|
|
|
|
|
|
|
template |
3672
|
|
|
|
|
|
|
persistent_unordered_map(const unordered_map& map, double load_factor, EntryEncode entry_encode); |
3673
|
|
|
|
|
|
|
template |
3674
|
|
|
|
|
|
|
persistent_unordered_map(const unordered_map& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode); |
3675
|
|
|
|
|
|
|
|
3676
|
|
|
|
|
|
|
// Manual creation functions |
3677
|
|
|
|
|
|
|
inline void resize(unsigned elems); |
3678
|
|
|
|
|
|
|
inline void add(const char* str, int str_len, int data_len); |
3679
|
|
|
|
|
|
|
inline void done_adding(); |
3680
|
|
|
|
|
|
|
inline unsigned char* fill(const char* str, int str_len, int data_len); |
3681
|
|
|
|
|
|
|
inline void done_filling(); |
3682
|
|
|
|
|
|
|
|
3683
|
|
|
|
|
|
|
// Serialization |
3684
|
|
|
|
|
|
|
inline void load(binary_decoder& data); |
3685
|
|
|
|
|
|
|
inline void save(binary_encoder& enc); |
3686
|
|
|
|
|
|
|
|
3687
|
|
|
|
|
|
|
private: |
3688
|
|
|
|
|
|
|
struct fnv_hash; |
3689
|
|
|
|
|
|
|
vector hashes; |
3690
|
|
|
|
|
|
|
|
3691
|
|
|
|
|
|
|
template |
3692
|
|
|
|
|
|
|
void construct(const map& map, double load_factor, EntryEncode entry_encode); |
3693
|
|
|
|
|
|
|
}; |
3694
|
|
|
|
|
|
|
|
3695
|
|
|
|
|
|
|
// Definitions |
3696
|
1063
|
0
|
|
|
|
|
struct persistent_unordered_map::fnv_hash { |
3697
|
24
|
|
|
|
|
|
fnv_hash(unsigned num) { |
3698
|
24
|
|
|
|
|
|
mask = 1; |
3699
|
76
|
100
|
|
|
|
|
while (mask < num) |
3700
|
52
|
|
|
|
|
|
mask <<= 1; |
3701
|
24
|
50
|
|
|
|
|
hash.resize(mask + 1); |
3702
|
24
|
|
|
|
|
|
mask--; |
3703
|
24
|
|
|
|
|
|
} |
3704
|
484
|
|
|
|
|
|
fnv_hash(binary_decoder& data) { |
3705
|
484
|
50
|
|
|
|
|
uint32_t size = data.next_4B(); |
3706
|
484
|
|
|
|
|
|
mask = size - 2; |
3707
|
484
|
50
|
|
|
|
|
hash.resize(size); |
3708
|
484
|
50
|
|
|
|
|
memcpy(hash.data(), data.next(size), size * sizeof(uint32_t)); |
3709
|
|
|
|
|
|
|
|
3710
|
484
|
50
|
|
|
|
|
size = data.next_4B(); |
3711
|
484
|
50
|
|
|
|
|
this->data.resize(size); |
3712
|
484
|
100
|
|
|
|
|
if (size) memcpy(this->data.data(), data.next(size), size); |
|
|
50
|
|
|
|
|
|
3713
|
484
|
|
|
|
|
|
} |
3714
|
|
|
|
|
|
|
|
3715
|
|
|
|
|
|
|
inline uint32_t index(const char* data, int len) const { |
3716
|
464
|
0
|
|
|
|
|
if (len <= 0) return 0; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
3717
|
456
|
0
|
|
|
|
|
if (len == 1) return unaligned_load(data); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
3718
|
427
|
0
|
|
|
|
|
if (len == 2) return unaligned_load(data); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
3719
|
|
|
|
|
|
|
|
3720
|
|
|
|
|
|
|
uint32_t hash = 2166136261U; |
3721
|
1563
|
0
|
|
|
|
|
while (len--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
3722
|
1413
|
|
|
|
|
|
hash = (hash ^ unsigned((signed char)*data++)) * 16777619U; |
3723
|
150
|
|
|
|
|
|
return hash & mask; |
3724
|
|
|
|
|
|
|
} |
3725
|
|
|
|
|
|
|
|
3726
|
|
|
|
|
|
|
inline void save(binary_encoder& enc); |
3727
|
|
|
|
|
|
|
|
3728
|
|
|
|
|
|
|
unsigned mask; |
3729
|
|
|
|
|
|
|
vector hash; |
3730
|
|
|
|
|
|
|
vector data; |
3731
|
|
|
|
|
|
|
}; |
3732
|
|
|
|
|
|
|
|
3733
|
|
|
|
|
|
|
template |
3734
|
8
|
|
|
|
|
|
const unsigned char* persistent_unordered_map::at(const char* str, int len, EntrySize entry_size) const { |
3735
|
8
|
0
|
|
|
|
|
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3736
|
|
|
|
|
|
|
|
3737
|
8
|
|
|
|
|
|
unsigned index = hashes[len].index(str, len); |
3738
|
16
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index]; |
3739
|
16
|
|
|
|
|
|
const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1]; |
3740
|
|
|
|
|
|
|
|
3741
|
8
|
0
|
|
|
|
|
if (len <= 2) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3742
|
8
|
0
|
|
|
|
|
return data != end ? data + len : nullptr; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3743
|
|
|
|
|
|
|
|
3744
|
0
|
0
|
|
|
|
|
while (data < end) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3745
|
0
|
0
|
|
|
|
|
if (small_memeq(str, data, len)) return data + len; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3746
|
0
|
|
|
|
|
|
data += len; |
3747
|
|
|
|
|
|
|
pointer_decoder decoder(data); |
3748
|
0
|
|
|
|
|
|
entry_size(decoder); |
3749
|
|
|
|
|
|
|
} |
3750
|
|
|
|
|
|
|
|
3751
|
|
|
|
|
|
|
return nullptr; |
3752
|
|
|
|
|
|
|
} |
3753
|
|
|
|
|
|
|
|
3754
|
|
|
|
|
|
|
template |
3755
|
438
|
|
|
|
|
|
const T* persistent_unordered_map::at_typed(const char* str, int len) const { |
3756
|
438
|
100
|
|
|
|
|
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
100
|
|
|
|
|
|
3757
|
|
|
|
|
|
|
|
3758
|
408
|
|
|
|
|
|
unsigned index = hashes[len].index(str, len); |
3759
|
816
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index]; |
3760
|
816
|
|
|
|
|
|
const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1]; |
3761
|
|
|
|
|
|
|
|
3762
|
408
|
100
|
|
|
|
|
if (len <= 2) |
|
|
100
|
|
|
|
|
|
3763
|
293
|
100
|
|
|
|
|
return data != end ? (const T*)(data + len) : nullptr; |
|
|
100
|
|
|
|
|
|
3764
|
|
|
|
|
|
|
|
3765
|
146
|
100
|
|
|
|
|
while (data < end) { |
|
|
100
|
|
|
|
|
|
3766
|
133
|
100
|
|
|
|
|
if (small_memeq(str, data, len)) return (const T*)(data + len); |
|
|
100
|
|
|
|
|
|
3767
|
31
|
|
|
|
|
|
data += len + sizeof(T); |
3768
|
|
|
|
|
|
|
} |
3769
|
|
|
|
|
|
|
|
3770
|
|
|
|
|
|
|
return nullptr; |
3771
|
|
|
|
|
|
|
} |
3772
|
|
|
|
|
|
|
|
3773
|
|
|
|
|
|
|
template |
3774
|
8
|
|
|
|
|
|
void persistent_unordered_map::iter(const char* str, int len, EntryProcess entry_process) const { |
3775
|
8
|
0
|
|
|
|
|
if (unsigned(len) >= hashes.size()) return; |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3776
|
|
|
|
|
|
|
|
3777
|
8
|
|
|
|
|
|
unsigned index = hashes[len].index(str, len); |
3778
|
16
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index]; |
3779
|
8
|
|
|
|
|
|
const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1]; |
3780
|
|
|
|
|
|
|
|
3781
|
21
|
0
|
|
|
|
|
while (data < end) { |
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3782
|
|
|
|
|
|
|
auto start = (const char*) data; |
3783
|
13
|
|
|
|
|
|
data += len; |
3784
|
|
|
|
|
|
|
pointer_decoder decoder(data); |
3785
|
13
|
|
|
|
|
|
entry_process(start, decoder); |
3786
|
|
|
|
|
|
|
} |
3787
|
|
|
|
|
|
|
} |
3788
|
|
|
|
|
|
|
|
3789
|
|
|
|
|
|
|
template |
3790
|
2
|
|
|
|
|
|
void persistent_unordered_map::iter_all(EntryProcess entry_process) const { |
3791
|
2
|
100
|
|
|
|
|
for (unsigned len = 0; len < hashes.size(); len++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3792
|
1
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data(); |
3793
|
|
|
|
|
|
|
const unsigned char* end = data + hashes[len].data.size(); |
3794
|
|
|
|
|
|
|
|
3795
|
2
|
100
|
|
|
|
|
while (data < end) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3796
|
|
|
|
|
|
|
auto start = (const char*) data; |
3797
|
1
|
|
|
|
|
|
data += len; |
3798
|
|
|
|
|
|
|
pointer_decoder decoder(data); |
3799
|
1
|
|
|
|
|
|
entry_process(start, len, decoder); |
3800
|
|
|
|
|
|
|
} |
3801
|
|
|
|
|
|
|
} |
3802
|
1
|
|
|
|
|
|
} |
3803
|
|
|
|
|
|
|
|
3804
|
|
|
|
|
|
|
int persistent_unordered_map::max_length() const { |
3805
|
20
|
|
|
|
|
|
return hashes.size(); |
3806
|
|
|
|
|
|
|
} |
3807
|
|
|
|
|
|
|
|
3808
|
|
|
|
|
|
|
const unsigned char* persistent_unordered_map::data_start(int len) const { |
3809
|
30
|
0
|
|
|
|
|
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
3810
|
|
|
|
|
|
|
} |
3811
|
|
|
|
|
|
|
|
3812
|
24
|
|
|
|
|
|
void persistent_unordered_map::resize(unsigned elems) { |
3813
|
24
|
100
|
|
|
|
|
if (hashes.size() == 0) hashes.emplace_back(1); |
3814
|
22
|
100
|
|
|
|
|
else if (hashes.size() == 1) hashes.emplace_back(1<<8); |
3815
|
20
|
100
|
|
|
|
|
else if (hashes.size() == 2) hashes.emplace_back(1<<16); |
3816
|
18
|
|
|
|
|
|
else hashes.emplace_back(elems); |
3817
|
24
|
|
|
|
|
|
} |
3818
|
|
|
|
|
|
|
|
3819
|
20
|
|
|
|
|
|
void persistent_unordered_map::add(const char* str, int str_len, int data_len) { |
3820
|
20
|
50
|
|
|
|
|
if (unsigned(str_len) < hashes.size()) |
3821
|
20
|
|
|
|
|
|
hashes[str_len].hash[hashes[str_len].index(str, str_len)] += str_len + data_len; |
3822
|
20
|
|
|
|
|
|
} |
3823
|
|
|
|
|
|
|
|
3824
|
2
|
|
|
|
|
|
void persistent_unordered_map::done_adding() { |
3825
|
26
|
100
|
|
|
|
|
for (auto&& hash : hashes) { |
3826
|
|
|
|
|
|
|
int total = 0; |
3827
|
131657
|
100
|
|
|
|
|
for (auto&& len : hash.hash) total += len, len = total - len; |
3828
|
24
|
|
|
|
|
|
hash.data.resize(total); |
3829
|
|
|
|
|
|
|
} |
3830
|
2
|
|
|
|
|
|
} |
3831
|
|
|
|
|
|
|
|
3832
|
20
|
|
|
|
|
|
unsigned char* persistent_unordered_map::fill(const char* str, int str_len, int data_len) { |
3833
|
20
|
50
|
|
|
|
|
if (unsigned(str_len) < hashes.size()) { |
3834
|
20
|
|
|
|
|
|
unsigned index = hashes[str_len].index(str, str_len); |
3835
|
40
|
|
|
|
|
|
unsigned offset = hashes[str_len].hash[index]; |
3836
|
20
|
|
|
|
|
|
small_memcpy(hashes[str_len].data.data() + offset, str, str_len); |
3837
|
20
|
|
|
|
|
|
hashes[str_len].hash[index] += str_len + data_len; |
3838
|
20
|
|
|
|
|
|
return hashes[str_len].data.data() + offset + str_len; |
3839
|
|
|
|
|
|
|
} |
3840
|
|
|
|
|
|
|
return nullptr; |
3841
|
|
|
|
|
|
|
} |
3842
|
|
|
|
|
|
|
|
3843
|
2
|
|
|
|
|
|
void persistent_unordered_map::done_filling() { |
3844
|
26
|
100
|
|
|
|
|
for (auto&& hash : hashes) |
3845
|
131657
|
100
|
|
|
|
|
for (int i = hash.hash.size() - 1; i >= 0; i--) |
3846
|
131633
|
100
|
|
|
|
|
hash.hash[i] = i > 0 ? hash.hash[i-1] : 0; |
3847
|
2
|
|
|
|
|
|
} |
3848
|
|
|
|
|
|
|
|
3849
|
103
|
|
|
|
|
|
void persistent_unordered_map::load(binary_decoder& data) { |
3850
|
103
|
|
|
|
|
|
unsigned sizes = data.next_1B(); |
3851
|
|
|
|
|
|
|
|
3852
|
|
|
|
|
|
|
hashes.clear(); |
3853
|
587
|
100
|
|
|
|
|
for (unsigned i = 0; i < sizes; i++) |
3854
|
484
|
|
|
|
|
|
hashes.emplace_back(data); |
3855
|
103
|
|
|
|
|
|
} |
3856
|
|
|
|
|
|
|
|
3857
|
|
|
|
|
|
|
} // namespace morphodita |
3858
|
|
|
|
|
|
|
|
3859
|
|
|
|
|
|
|
///////// |
3860
|
|
|
|
|
|
|
// File: morphodita/derivator/derivator_dictionary.h |
3861
|
|
|
|
|
|
|
///////// |
3862
|
|
|
|
|
|
|
|
3863
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
3864
|
|
|
|
|
|
|
// |
3865
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
3866
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3867
|
|
|
|
|
|
|
// |
3868
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3869
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3870
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3871
|
|
|
|
|
|
|
|
3872
|
|
|
|
|
|
|
namespace morphodita { |
3873
|
|
|
|
|
|
|
|
3874
|
0
|
|
|
|
|
|
class derivator_dictionary : public derivator { |
3875
|
|
|
|
|
|
|
public: |
3876
|
|
|
|
|
|
|
virtual bool parent(string_piece lemma, derivated_lemma& parent) const override; |
3877
|
|
|
|
|
|
|
virtual bool children(string_piece lemma, vector& children) const override; |
3878
|
|
|
|
|
|
|
|
3879
|
|
|
|
|
|
|
bool load(istream& is); |
3880
|
|
|
|
|
|
|
|
3881
|
|
|
|
|
|
|
private: |
3882
|
|
|
|
|
|
|
friend class morpho; |
3883
|
|
|
|
|
|
|
const morpho* dictionary; |
3884
|
|
|
|
|
|
|
persistent_unordered_map derinet; |
3885
|
|
|
|
|
|
|
}; |
3886
|
|
|
|
|
|
|
|
3887
|
|
|
|
|
|
|
} // namespace morphodita |
3888
|
|
|
|
|
|
|
|
3889
|
|
|
|
|
|
|
///////// |
3890
|
|
|
|
|
|
|
// File: utils/compressor.h |
3891
|
|
|
|
|
|
|
///////// |
3892
|
|
|
|
|
|
|
|
3893
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
3894
|
|
|
|
|
|
|
// |
3895
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
3896
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3897
|
|
|
|
|
|
|
// |
3898
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3899
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3900
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3901
|
|
|
|
|
|
|
|
3902
|
|
|
|
|
|
|
namespace utils { |
3903
|
|
|
|
|
|
|
|
3904
|
|
|
|
|
|
|
class binary_decoder; |
3905
|
|
|
|
|
|
|
class binary_encoder; |
3906
|
|
|
|
|
|
|
|
3907
|
|
|
|
|
|
|
class compressor { |
3908
|
|
|
|
|
|
|
public: |
3909
|
|
|
|
|
|
|
static bool load(istream& is, binary_decoder& data); |
3910
|
|
|
|
|
|
|
static bool save(ostream& os, const binary_encoder& enc); |
3911
|
|
|
|
|
|
|
}; |
3912
|
|
|
|
|
|
|
|
3913
|
|
|
|
|
|
|
} // namespace utils |
3914
|
|
|
|
|
|
|
|
3915
|
|
|
|
|
|
|
///////// |
3916
|
|
|
|
|
|
|
// File: morphodita/derivator/derivator_dictionary.cpp |
3917
|
|
|
|
|
|
|
///////// |
3918
|
|
|
|
|
|
|
|
3919
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
3920
|
|
|
|
|
|
|
// |
3921
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
3922
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
3923
|
|
|
|
|
|
|
// |
3924
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
3925
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
3926
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
3927
|
|
|
|
|
|
|
|
3928
|
|
|
|
|
|
|
namespace morphodita { |
3929
|
|
|
|
|
|
|
|
3930
|
0
|
|
|
|
|
|
bool derivator_dictionary::parent(string_piece lemma, derivated_lemma& parent) const { |
3931
|
0
|
0
|
|
|
|
|
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
3932
|
|
|
|
|
|
|
|
3933
|
0
|
|
|
|
|
|
auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) { |
3934
|
|
|
|
|
|
|
data.next(data.next_1B()); |
3935
|
|
|
|
|
|
|
data.next_4B(); |
3936
|
|
|
|
|
|
|
data.next(data.next_2B()); |
3937
|
0
|
|
|
|
|
|
}); |
3938
|
0
|
0
|
|
|
|
|
if (lemma_data) { |
3939
|
0
|
|
|
|
|
|
auto parent_encoded = *(uint32_t*)(lemma_data + 1 + *lemma_data); |
3940
|
0
|
0
|
|
|
|
|
if (parent_encoded) { |
3941
|
0
|
|
|
|
|
|
unsigned parent_len = parent_encoded & 0xFF; |
3942
|
0
|
|
|
|
|
|
auto parent_data = derinet.data_start(parent_len) + (parent_encoded >> 8); |
3943
|
0
|
|
|
|
|
|
parent.lemma.assign((const char*) parent_data, parent_len); |
3944
|
0
|
0
|
|
|
|
|
if (parent_data[parent_len]) |
3945
|
0
|
|
|
|
|
|
parent.lemma.append((const char*) parent_data + parent_len + 1, parent_data[parent_len]); |
3946
|
|
|
|
|
|
|
return true; |
3947
|
|
|
|
|
|
|
} |
3948
|
|
|
|
|
|
|
} |
3949
|
|
|
|
|
|
|
parent.lemma.clear(); |
3950
|
0
|
|
|
|
|
|
return false; |
3951
|
|
|
|
|
|
|
} |
3952
|
|
|
|
|
|
|
|
3953
|
0
|
|
|
|
|
|
bool derivator_dictionary::children(string_piece lemma, vector& children) const { |
3954
|
0
|
0
|
|
|
|
|
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
3955
|
|
|
|
|
|
|
|
3956
|
0
|
|
|
|
|
|
auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) { |
3957
|
|
|
|
|
|
|
data.next(data.next_1B()); |
3958
|
|
|
|
|
|
|
data.next_4B(); |
3959
|
|
|
|
|
|
|
data.next(data.next_2B()); |
3960
|
0
|
|
|
|
|
|
}); |
3961
|
0
|
0
|
|
|
|
|
if (lemma_data) { |
3962
|
0
|
|
|
|
|
|
auto children_len = *(uint16_t*)(lemma_data + 1 + *lemma_data + 4); |
3963
|
0
|
|
|
|
|
|
auto children_encoded = (uint32_t*)(lemma_data + 1 + *lemma_data + 4 + 2); |
3964
|
0
|
0
|
|
|
|
|
if (children_len) { |
3965
|
0
|
|
|
|
|
|
children.resize(children_len); |
3966
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < children_len; i++) { |
3967
|
0
|
|
|
|
|
|
unsigned child_len = children_encoded[i] & 0xFF; |
3968
|
0
|
|
|
|
|
|
auto child_data = derinet.data_start(child_len) + (children_encoded[i] >> 8); |
3969
|
0
|
|
|
|
|
|
children[i].lemma.assign((const char*) child_data, child_len); |
3970
|
0
|
0
|
|
|
|
|
if (child_data[child_len]) |
3971
|
0
|
|
|
|
|
|
children[i].lemma.append((const char*) child_data + child_len + 1, child_data[child_len]); |
3972
|
|
|
|
|
|
|
} |
3973
|
|
|
|
|
|
|
return true; |
3974
|
|
|
|
|
|
|
} |
3975
|
|
|
|
|
|
|
} |
3976
|
0
|
|
|
|
|
|
children.clear(); |
3977
|
0
|
|
|
|
|
|
return false; |
3978
|
|
|
|
|
|
|
} |
3979
|
|
|
|
|
|
|
|
3980
|
0
|
|
|
|
|
|
bool derivator_dictionary::load(istream& is) { |
3981
|
|
|
|
|
|
|
binary_decoder data; |
3982
|
0
|
0
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
0
|
|
|
|
|
|
3983
|
|
|
|
|
|
|
|
3984
|
|
|
|
|
|
|
try { |
3985
|
0
|
0
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
0
|
|
|
|
|
|
3986
|
0
|
0
|
|
|
|
|
derinet.resize(data.next_4B()); |
|
|
0
|
|
|
|
|
|
3987
|
|
|
|
|
|
|
|
3988
|
|
|
|
|
|
|
unsigned data_position = data.tell(); |
3989
|
|
|
|
|
|
|
vector lemma, parent; |
3990
|
0
|
0
|
|
|
|
|
for (int pass = 1; pass <= 3; pass++) { |
3991
|
0
|
0
|
|
|
|
|
if (pass > 1) data.seek(data_position); |
|
|
0
|
|
|
|
|
|
3992
|
|
|
|
|
|
|
|
3993
|
|
|
|
|
|
|
lemma.clear(); |
3994
|
0
|
0
|
|
|
|
|
for (int i = data.next_4B(); i > 0; i--) { |
|
|
0
|
|
|
|
|
|
3995
|
0
|
0
|
|
|
|
|
lemma.resize(lemma.size() - data.next_1B()); |
|
|
0
|
|
|
|
|
|
3996
|
0
|
0
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
0
|
|
|
|
|
|
3997
|
0
|
0
|
|
|
|
|
lemma.push_back(data.next_1B()); |
3998
|
|
|
|
|
|
|
|
3999
|
0
|
0
|
|
|
|
|
unsigned char lemma_comment_len = data.next_1B(); |
4000
|
0
|
0
|
|
|
|
|
const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr; |
|
|
0
|
|
|
|
|
|
4001
|
|
|
|
|
|
|
|
4002
|
0
|
0
|
|
|
|
|
unsigned children = data.next_2B(); |
4003
|
|
|
|
|
|
|
|
4004
|
0
|
0
|
|
|
|
|
if (pass == 3) parent.clear(); |
4005
|
|
|
|
|
|
|
enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 }; |
4006
|
0
|
0
|
|
|
|
|
int operations = data.next_1B(); |
4007
|
0
|
0
|
|
|
|
|
if (operations) { |
4008
|
0
|
0
|
|
|
|
|
int remove_start = operations & REMOVE_START ? data.next_1B() : 0; |
|
|
0
|
|
|
|
|
|
4009
|
0
|
0
|
|
|
|
|
int remove_end = operations & REMOVE_END ? data.next_1B() : 0; |
|
|
0
|
|
|
|
|
|
4010
|
0
|
0
|
|
|
|
|
if (operations & ADD_START) { |
4011
|
0
|
0
|
|
|
|
|
int add_start = data.next_1B(); |
4012
|
0
|
0
|
|
|
|
|
const char* str = data.next(add_start); |
4013
|
0
|
0
|
|
|
|
|
if (pass == 3) parent.assign(str, str + add_start); |
4014
|
|
|
|
|
|
|
} |
4015
|
0
|
0
|
|
|
|
|
if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end); |
|
|
0
|
|
|
|
|
|
4016
|
0
|
0
|
|
|
|
|
if (operations & ADD_END) { |
4017
|
0
|
0
|
|
|
|
|
int add_end = data.next_1B(); |
4018
|
0
|
0
|
|
|
|
|
const char* str = data.next(add_end); |
4019
|
0
|
0
|
|
|
|
|
if (pass == 3) parent.insert(parent.end(), str, str + add_end); |
4020
|
|
|
|
|
|
|
} |
4021
|
|
|
|
|
|
|
} |
4022
|
|
|
|
|
|
|
|
4023
|
0
|
0
|
|
|
|
|
if (pass == 1) { |
4024
|
0
|
|
|
|
|
|
derinet.add(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children); |
4025
|
0
|
0
|
|
|
|
|
} else if (pass == 2) { |
4026
|
0
|
|
|
|
|
|
unsigned char* lemma_data = derinet.fill(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children); |
4027
|
0
|
|
|
|
|
|
*lemma_data++ = lemma_comment_len; |
4028
|
0
|
0
|
|
|
|
|
while (lemma_comment_len--) *lemma_data++ = *lemma_comment++; |
4029
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, 0); |
4030
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, children); |
4031
|
0
|
0
|
|
|
|
|
if (children) unaligned_store(((uint32_t*)lemma_data) + children - 1, 0); |
4032
|
0
|
0
|
|
|
|
|
} else if (pass == 3 && !parent.empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4033
|
0
|
|
|
|
|
|
auto lemma_data = derinet.at(lemma.data(), lemma.size(), [](pointer_decoder& data) { |
4034
|
|
|
|
|
|
|
data.next(data.next_1B()); |
4035
|
|
|
|
|
|
|
data.next_4B(); |
4036
|
|
|
|
|
|
|
data.next(data.next_2B()); |
4037
|
0
|
|
|
|
|
|
}); |
4038
|
0
|
|
|
|
|
|
auto parent_data = derinet.at(parent.data(), parent.size(), [](pointer_decoder& data) { |
4039
|
|
|
|
|
|
|
data.next(data.next_1B()); |
4040
|
|
|
|
|
|
|
data.next_4B(); |
4041
|
|
|
|
|
|
|
data.next(data.next_2B()); |
4042
|
0
|
|
|
|
|
|
}); |
4043
|
0
|
0
|
|
|
|
|
assert(lemma_data && parent_data); |
4044
|
|
|
|
|
|
|
|
4045
|
0
|
|
|
|
|
|
unsigned parent_offset = parent_data - parent.size() - derinet.data_start(parent.size()); |
4046
|
0
|
0
|
|
|
|
|
assert(parent.size() < (1<<8) && parent_offset < (1<<24)); |
|
|
0
|
|
|
|
|
|
4047
|
0
|
|
|
|
|
|
unaligned_store((void *)(lemma_data + 1 + *lemma_data), (parent_offset << 8) | parent.size()); |
4048
|
|
|
|
|
|
|
|
4049
|
0
|
|
|
|
|
|
unsigned lemma_offset = lemma_data - lemma.size() - derinet.data_start(lemma.size()); |
4050
|
0
|
0
|
|
|
|
|
assert(lemma.size() < (1<<8) && lemma_offset < (1<<24)); |
|
|
0
|
|
|
|
|
|
4051
|
0
|
|
|
|
|
|
auto children_len = unaligned_load(parent_data + 1 + *parent_data + 4); |
4052
|
0
|
|
|
|
|
|
auto children = (uint32_t*)(parent_data + 1 + *parent_data + 4 + 2); |
4053
|
0
|
|
|
|
|
|
auto child_index = unaligned_load(children + children_len - 1); |
4054
|
0
|
|
|
|
|
|
unaligned_store(children + child_index, (lemma_offset << 8) | lemma.size()); |
4055
|
0
|
0
|
|
|
|
|
if (child_index+1 < children_len) |
4056
|
0
|
|
|
|
|
|
unaligned_store(children + children_len - 1, unaligned_load(children + children_len - 1) + 1); |
4057
|
|
|
|
|
|
|
} |
4058
|
|
|
|
|
|
|
} |
4059
|
|
|
|
|
|
|
|
4060
|
0
|
0
|
|
|
|
|
if (pass == 1) |
4061
|
0
|
0
|
|
|
|
|
derinet.done_adding(); |
4062
|
0
|
0
|
|
|
|
|
if (pass == 2) |
4063
|
0
|
|
|
|
|
|
derinet.done_filling(); |
4064
|
|
0
|
|
|
|
|
} |
4065
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
4066
|
|
|
|
|
|
|
return false; |
4067
|
|
|
|
|
|
|
} |
4068
|
0
|
|
|
|
|
|
return true; |
4069
|
|
|
|
|
|
|
} |
4070
|
|
|
|
|
|
|
|
4071
|
|
|
|
|
|
|
} // namespace morphodita |
4072
|
|
|
|
|
|
|
|
4073
|
|
|
|
|
|
|
///////// |
4074
|
|
|
|
|
|
|
// File: morphodita/morpho/casing_variants.h |
4075
|
|
|
|
|
|
|
///////// |
4076
|
|
|
|
|
|
|
|
4077
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4078
|
|
|
|
|
|
|
// |
4079
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4080
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4081
|
|
|
|
|
|
|
// |
4082
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4083
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4084
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4085
|
|
|
|
|
|
|
|
4086
|
|
|
|
|
|
|
namespace morphodita { |
4087
|
|
|
|
|
|
|
|
4088
|
7
|
|
|
|
|
|
inline void generate_casing_variants(string_piece form, string& form_uclc, string& form_lc) { |
4089
|
|
|
|
|
|
|
using namespace unilib; |
4090
|
|
|
|
|
|
|
|
4091
|
|
|
|
|
|
|
// Detect uppercase+titlecase characters. |
4092
|
|
|
|
|
|
|
bool first_Lut = false; // first character is uppercase or titlecase |
4093
|
|
|
|
|
|
|
bool rest_has_Lut = false; // any character but first is uppercase or titlecase |
4094
|
|
|
|
|
|
|
{ |
4095
|
7
|
|
|
|
|
|
string_piece form_tmp = form; |
4096
|
14
|
|
|
|
|
|
first_Lut = unicode::category(utf8::decode(form_tmp.str, form_tmp.len)) & unicode::Lut; |
4097
|
29
|
100
|
|
|
|
|
while (form_tmp.len && !rest_has_Lut) |
|
|
50
|
|
|
|
|
|
4098
|
22
|
|
|
|
|
|
rest_has_Lut = unicode::category(utf8::decode(form_tmp.str, form_tmp.len)) & unicode::Lut; |
4099
|
|
|
|
|
|
|
} |
4100
|
|
|
|
|
|
|
|
4101
|
|
|
|
|
|
|
// Generate all casing variants if needed (they are different than given form). |
4102
|
|
|
|
|
|
|
// We only replace letters with their lowercase variants. |
4103
|
|
|
|
|
|
|
// - form_uclc: first uppercase, rest lowercase |
4104
|
|
|
|
|
|
|
// - form_lc: all lowercase |
4105
|
|
|
|
|
|
|
|
4106
|
7
|
100
|
|
|
|
|
if (first_Lut && !rest_has_Lut) { // common case allowing fast execution |
4107
|
1
|
|
|
|
|
|
form_lc.reserve(form.len); |
4108
|
1
|
|
|
|
|
|
string_piece form_tmp = form; |
4109
|
1
|
|
|
|
|
|
utf8::append(form_lc, unicode::lowercase(utf8::decode(form_tmp.str, form_tmp.len))); |
4110
|
1
|
|
|
|
|
|
form_lc.append(form_tmp.str, form_tmp.len); |
4111
|
6
|
50
|
|
|
|
|
} else if (!first_Lut && rest_has_Lut) { |
4112
|
0
|
|
|
|
|
|
form_lc.reserve(form.len); |
4113
|
0
|
|
|
|
|
|
utf8::map(unicode::lowercase, form.str, form.len, form_lc); |
4114
|
6
|
50
|
|
|
|
|
} else if (first_Lut && rest_has_Lut) { |
4115
|
0
|
|
|
|
|
|
form_lc.reserve(form.len); |
4116
|
0
|
|
|
|
|
|
form_uclc.reserve(form.len); |
4117
|
0
|
|
|
|
|
|
string_piece form_tmp = form; |
4118
|
0
|
|
|
|
|
|
char32_t first = utf8::decode(form_tmp.str, form_tmp.len); |
4119
|
0
|
|
|
|
|
|
utf8::append(form_lc, unicode::lowercase(first)); |
4120
|
0
|
|
|
|
|
|
utf8::append(form_uclc, first); |
4121
|
0
|
0
|
|
|
|
|
while (form_tmp.len) { |
4122
|
0
|
|
|
|
|
|
char32_t lowercase = unicode::lowercase(utf8::decode(form_tmp.str, form_tmp.len)); |
4123
|
0
|
|
|
|
|
|
utf8::append(form_lc, lowercase); |
4124
|
0
|
|
|
|
|
|
utf8::append(form_uclc, lowercase); |
4125
|
|
|
|
|
|
|
} |
4126
|
|
|
|
|
|
|
} |
4127
|
7
|
|
|
|
|
|
} |
4128
|
|
|
|
|
|
|
|
4129
|
|
|
|
|
|
|
} // namespace morphodita |
4130
|
|
|
|
|
|
|
|
4131
|
|
|
|
|
|
|
///////// |
4132
|
|
|
|
|
|
|
// File: morphodita/morpho/czech_lemma_addinfo.h |
4133
|
|
|
|
|
|
|
///////// |
4134
|
|
|
|
|
|
|
|
4135
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4136
|
|
|
|
|
|
|
// |
4137
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4138
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4139
|
|
|
|
|
|
|
// |
4140
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4141
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4142
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4143
|
|
|
|
|
|
|
|
4144
|
|
|
|
|
|
|
namespace morphodita { |
4145
|
|
|
|
|
|
|
|
4146
|
|
|
|
|
|
|
// Declarations |
4147
|
0
|
|
|
|
|
|
struct czech_lemma_addinfo { |
4148
|
|
|
|
|
|
|
inline static int raw_lemma_len(string_piece lemma); |
4149
|
|
|
|
|
|
|
inline static int lemma_id_len(string_piece lemma); |
4150
|
|
|
|
|
|
|
inline static string format(const unsigned char* addinfo, int addinfo_len); |
4151
|
|
|
|
|
|
|
inline static bool generatable(const unsigned char* addinfo, int addinfo_len); |
4152
|
|
|
|
|
|
|
|
4153
|
|
|
|
|
|
|
inline int parse(string_piece lemma, bool die_on_failure = false); |
4154
|
|
|
|
|
|
|
inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len); |
4155
|
|
|
|
|
|
|
|
4156
|
|
|
|
|
|
|
vector data; |
4157
|
|
|
|
|
|
|
}; |
4158
|
|
|
|
|
|
|
|
4159
|
|
|
|
|
|
|
// Definitions |
4160
|
0
|
|
|
|
|
|
int czech_lemma_addinfo::raw_lemma_len(string_piece lemma) { |
4161
|
|
|
|
|
|
|
// Lemma ends by a '-[0-9]', '`' or '_' on non-first position. |
4162
|
0
|
0
|
|
|
|
|
for (unsigned len = 1; len < lemma.len; len++) |
4163
|
0
|
0
|
|
|
|
|
if (lemma.str[len] == '`' || lemma.str[len] == '_' || |
|
|
0
|
|
|
|
|
|
4164
|
0
|
0
|
|
|
|
|
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4165
|
0
|
|
|
|
|
|
return len; |
4166
|
0
|
|
|
|
|
|
return lemma.len; |
4167
|
|
|
|
|
|
|
} |
4168
|
|
|
|
|
|
|
|
4169
|
0
|
|
|
|
|
|
int czech_lemma_addinfo::lemma_id_len(string_piece lemma) { |
4170
|
|
|
|
|
|
|
// Lemma ends by a '-[0-9]', '`' or '_' on non-first position. |
4171
|
0
|
0
|
|
|
|
|
for (unsigned len = 1; len < lemma.len; len++) { |
4172
|
0
|
0
|
|
|
|
|
if (lemma.str[len] == '`' || lemma.str[len] == '_') |
4173
|
0
|
|
|
|
|
|
return len; |
4174
|
0
|
0
|
|
|
|
|
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4175
|
0
|
|
|
|
|
|
len += 2; |
4176
|
0
|
0
|
|
|
|
|
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4177
|
0
|
|
|
|
|
|
return len; |
4178
|
|
|
|
|
|
|
} |
4179
|
|
|
|
|
|
|
} |
4180
|
0
|
|
|
|
|
|
return lemma.len; |
4181
|
|
|
|
|
|
|
} |
4182
|
|
|
|
|
|
|
|
4183
|
0
|
|
|
|
|
|
string czech_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) { |
4184
|
|
|
|
|
|
|
string res; |
4185
|
|
|
|
|
|
|
|
4186
|
0
|
0
|
|
|
|
|
if (addinfo_len) { |
4187
|
0
|
0
|
|
|
|
|
res.reserve(addinfo_len + 4); |
4188
|
0
|
0
|
|
|
|
|
if (addinfo[0] != 255) { |
4189
|
|
|
|
|
|
|
char num[5]; |
4190
|
0
|
|
|
|
|
|
snprintf(num, sizeof(num), "-%u", addinfo[0]); |
4191
|
|
|
|
|
|
|
res += num; |
4192
|
|
|
|
|
|
|
} |
4193
|
0
|
0
|
|
|
|
|
for (int i = 1; i < addinfo_len; i++) |
4194
|
0
|
|
|
|
|
|
res += addinfo[i]; |
4195
|
|
|
|
|
|
|
} |
4196
|
|
|
|
|
|
|
|
4197
|
0
|
|
|
|
|
|
return res; |
4198
|
|
|
|
|
|
|
} |
4199
|
|
|
|
|
|
|
|
4200
|
|
|
|
|
|
|
bool czech_lemma_addinfo::generatable(const unsigned char* addinfo, int addinfo_len) { |
4201
|
0
|
0
|
|
|
|
|
for (int i = 1; i + 2 < addinfo_len; i++) |
4202
|
0
|
0
|
|
|
|
|
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4203
|
|
|
|
|
|
|
return false; |
4204
|
|
|
|
|
|
|
|
4205
|
|
|
|
|
|
|
return true; |
4206
|
|
|
|
|
|
|
} |
4207
|
|
|
|
|
|
|
|
4208
|
0
|
|
|
|
|
|
int czech_lemma_addinfo::parse(string_piece lemma, bool die_on_failure) { |
4209
|
|
|
|
|
|
|
data.clear(); |
4210
|
|
|
|
|
|
|
|
4211
|
0
|
|
|
|
|
|
const char* lemma_info = lemma.str + raw_lemma_len(lemma); |
4212
|
0
|
0
|
|
|
|
|
if (lemma_info < lemma.str + lemma.len) { |
4213
|
0
|
|
|
|
|
|
int lemma_num = 255; |
4214
|
|
|
|
|
|
|
const char* lemma_additional_info = lemma_info; |
4215
|
|
|
|
|
|
|
|
4216
|
0
|
0
|
|
|
|
|
if (*lemma_info == '-') { |
4217
|
0
|
|
|
|
|
|
lemma_num = 0; |
4218
|
0
|
|
|
|
|
|
for (lemma_additional_info = lemma_info + 1; |
4219
|
0
|
0
|
|
|
|
|
lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9'); |
|
|
0
|
|
|
|
|
|
4220
|
|
|
|
|
|
|
lemma_additional_info++) |
4221
|
0
|
|
|
|
|
|
lemma_num = 10 * lemma_num + (*lemma_additional_info - '0'); |
4222
|
|
|
|
|
|
|
|
4223
|
0
|
0
|
|
|
|
|
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4224
|
0
|
0
|
|
|
|
|
if (die_on_failure) |
4225
|
0
|
0
|
|
|
|
|
training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4226
|
|
|
|
|
|
|
else |
4227
|
0
|
|
|
|
|
|
lemma_num = 255; |
4228
|
|
|
|
|
|
|
} |
4229
|
|
|
|
|
|
|
} |
4230
|
0
|
|
|
|
|
|
data.emplace_back(lemma_num); |
4231
|
0
|
0
|
|
|
|
|
while (lemma_additional_info < lemma.str + lemma.len) |
4232
|
0
|
|
|
|
|
|
data.push_back(*(unsigned char*)lemma_additional_info++); |
4233
|
|
|
|
|
|
|
|
4234
|
0
|
0
|
|
|
|
|
if (data.size() > 255) { |
4235
|
0
|
0
|
|
|
|
|
if (die_on_failure) |
4236
|
0
|
0
|
|
|
|
|
training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!'); |
|
|
0
|
|
|
|
|
|
4237
|
|
|
|
|
|
|
else |
4238
|
0
|
|
|
|
|
|
data.resize(255); |
4239
|
|
|
|
|
|
|
} |
4240
|
|
|
|
|
|
|
} |
4241
|
|
|
|
|
|
|
|
4242
|
0
|
|
|
|
|
|
return lemma_info - lemma.str; |
4243
|
|
|
|
|
|
|
} |
4244
|
|
|
|
|
|
|
|
4245
|
|
|
|
|
|
|
bool czech_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) { |
4246
|
0
|
0
|
|
|
|
|
if (data.empty()) return true; |
4247
|
0
|
0
|
|
|
|
|
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4248
|
|
|
|
|
|
|
return true; |
4249
|
|
|
|
|
|
|
} |
4250
|
|
|
|
|
|
|
|
4251
|
|
|
|
|
|
|
} // namespace morphodita |
4252
|
|
|
|
|
|
|
|
4253
|
|
|
|
|
|
|
///////// |
4254
|
|
|
|
|
|
|
// File: morphodita/morpho/tag_filter.h |
4255
|
|
|
|
|
|
|
///////// |
4256
|
|
|
|
|
|
|
|
4257
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4258
|
|
|
|
|
|
|
// |
4259
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4260
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4261
|
|
|
|
|
|
|
// |
4262
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4263
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4264
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4265
|
|
|
|
|
|
|
|
4266
|
|
|
|
|
|
|
namespace morphodita { |
4267
|
|
|
|
|
|
|
|
4268
|
|
|
|
|
|
|
// Declarations |
4269
|
0
|
|
|
|
|
|
class tag_filter { |
4270
|
|
|
|
|
|
|
public: |
4271
|
|
|
|
|
|
|
tag_filter(const char* filter = nullptr); |
4272
|
|
|
|
|
|
|
|
4273
|
|
|
|
|
|
|
inline bool matches(const char* tag) const; |
4274
|
|
|
|
|
|
|
|
4275
|
|
|
|
|
|
|
private: |
4276
|
|
|
|
|
|
|
struct char_filter { |
4277
|
|
|
|
|
|
|
char_filter(int pos, bool negate, int chars_offset, int chars_len) |
4278
|
0
|
|
|
|
|
|
: pos(pos), negate(negate), chars_offset(chars_offset), chars_len(chars_len) {} |
4279
|
|
|
|
|
|
|
|
4280
|
|
|
|
|
|
|
int pos; |
4281
|
|
|
|
|
|
|
bool negate; |
4282
|
|
|
|
|
|
|
int chars_offset, chars_len; |
4283
|
|
|
|
|
|
|
}; |
4284
|
|
|
|
|
|
|
|
4285
|
|
|
|
|
|
|
string wildcard; |
4286
|
|
|
|
|
|
|
std::vector filters; |
4287
|
|
|
|
|
|
|
}; |
4288
|
|
|
|
|
|
|
|
4289
|
|
|
|
|
|
|
// Definitions |
4290
|
0
|
|
|
|
|
|
inline bool tag_filter::matches(const char* tag) const { |
4291
|
0
|
0
|
|
|
|
|
if (filters.empty()) return true; |
4292
|
|
|
|
|
|
|
|
4293
|
|
|
|
|
|
|
int tag_pos = 0; |
4294
|
0
|
0
|
|
|
|
|
for (auto&& filter : filters) { |
4295
|
|
|
|
|
|
|
// Skip until next filter position. If the tag ends prematurely, accept. |
4296
|
0
|
0
|
|
|
|
|
while (tag_pos < filter.pos) |
4297
|
0
|
0
|
|
|
|
|
if (!tag[tag_pos++]) |
4298
|
|
|
|
|
|
|
return true; |
4299
|
0
|
0
|
|
|
|
|
if (!tag[tag_pos]) |
4300
|
|
|
|
|
|
|
return true; |
4301
|
|
|
|
|
|
|
|
4302
|
|
|
|
|
|
|
// We assume filter.chars_len >= 1. |
4303
|
0
|
|
|
|
|
|
bool matched = (wildcard[filter.chars_offset] == tag[tag_pos]) ^ filter.negate; |
4304
|
0
|
0
|
|
|
|
|
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
|
|
0
|
|
|
|
|
|
4305
|
0
|
|
|
|
|
|
matched = (wildcard[filter.chars_offset + i] == tag[tag_pos]) ^ filter.negate; |
4306
|
0
|
0
|
|
|
|
|
if (!matched) return false; |
4307
|
|
|
|
|
|
|
} |
4308
|
|
|
|
|
|
|
return true; |
4309
|
|
|
|
|
|
|
} |
4310
|
|
|
|
|
|
|
|
4311
|
|
|
|
|
|
|
} // namespace morphodita |
4312
|
|
|
|
|
|
|
|
4313
|
|
|
|
|
|
|
///////// |
4314
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_dictionary.h |
4315
|
|
|
|
|
|
|
///////// |
4316
|
|
|
|
|
|
|
|
4317
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4318
|
|
|
|
|
|
|
// |
4319
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4320
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4321
|
|
|
|
|
|
|
// |
4322
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4323
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4324
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4325
|
|
|
|
|
|
|
|
4326
|
|
|
|
|
|
|
namespace morphodita { |
4327
|
|
|
|
|
|
|
|
4328
|
|
|
|
|
|
|
// Declarations |
4329
|
|
|
|
|
|
|
template |
4330
|
2
|
|
|
|
|
|
class morpho_dictionary { |
4331
|
|
|
|
|
|
|
public: |
4332
|
|
|
|
|
|
|
void load(binary_decoder& data); |
4333
|
|
|
|
|
|
|
void analyze(string_piece form, vector& lemmas) const; |
4334
|
|
|
|
|
|
|
bool generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms) const; |
4335
|
|
|
|
|
|
|
private: |
4336
|
|
|
|
|
|
|
persistent_unordered_map lemmas, roots, suffixes; |
4337
|
|
|
|
|
|
|
|
4338
|
|
|
|
|
|
|
vector tags; |
4339
|
|
|
|
|
|
|
vector>>> classes; |
4340
|
|
|
|
|
|
|
}; |
4341
|
|
|
|
|
|
|
|
4342
|
|
|
|
|
|
|
// Definitions |
4343
|
|
|
|
|
|
|
template |
4344
|
1
|
|
|
|
|
|
void morpho_dictionary::load(binary_decoder& data) { |
4345
|
|
|
|
|
|
|
// Prepare lemmas and roots hashes |
4346
|
13
|
100
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4347
|
12
|
|
|
|
|
|
lemmas.resize(data.next_4B()); |
4348
|
13
|
100
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4349
|
12
|
|
|
|
|
|
roots.resize(data.next_4B()); |
4350
|
|
|
|
|
|
|
|
4351
|
|
|
|
|
|
|
// Perform two pass over the lemmas and roots data, filling the hashes. |
4352
|
|
|
|
|
|
|
|
4353
|
1
|
|
|
|
|
|
vector lemma(max(lemmas.max_length(), roots.max_length())); |
4354
|
1
|
50
|
|
|
|
|
vector root(max(lemmas.max_length(), roots.max_length())); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4355
|
|
|
|
|
|
|
unsigned data_position = data.tell(); |
4356
|
3
|
100
|
|
|
|
|
for (int pass = 1; pass <= 2; pass++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4357
|
2
|
100
|
|
|
|
|
if (pass > 1) data.seek(data_position); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4358
|
|
|
|
|
|
|
|
4359
|
|
|
|
|
|
|
int lemma_len = 0; |
4360
|
|
|
|
|
|
|
int root_len = 0; |
4361
|
|
|
|
|
|
|
|
4362
|
22
|
50
|
|
|
|
|
for (int i = data.next_4B(); i > 0; i--) { |
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4363
|
20
|
50
|
|
|
|
|
lemma_len -= data.next_1B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4364
|
126
|
50
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4365
|
106
|
50
|
|
|
|
|
lemma[lemma_len++] = data.next_1B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4366
|
20
|
50
|
|
|
|
|
unsigned char lemma_info_len = data.next_1B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4367
|
20
|
50
|
|
|
|
|
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4368
|
20
|
50
|
|
|
|
|
unsigned lemma_roots = data.next_1B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4369
|
|
|
|
|
|
|
|
4370
|
|
|
|
|
|
|
unsigned char* lemma_data /* to keep compiler happy */ = nullptr; |
4371
|
|
|
|
|
|
|
unsigned lemma_offset /* to keep compiler happy */ = 0; |
4372
|
|
|
|
|
|
|
|
4373
|
20
|
100
|
|
|
|
|
if (pass == 1) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4374
|
10
|
|
|
|
|
|
lemmas.add(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t))); |
4375
|
|
|
|
|
|
|
} else /*if (pass == 2)*/ { |
4376
|
10
|
|
|
|
|
|
lemma_data = lemmas.fill(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t))); |
4377
|
20
|
|
|
|
|
|
lemma_offset = lemma_data - lemma_len - lemmas.data_start(lemma_len); |
4378
|
|
|
|
|
|
|
|
4379
|
10
|
|
|
|
|
|
*lemma_data++ = lemma_info_len; |
4380
|
10
|
50
|
|
|
|
|
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4381
|
10
|
|
|
|
|
|
*lemma_data++ = lemma_roots; |
4382
|
|
|
|
|
|
|
} |
4383
|
|
|
|
|
|
|
|
4384
|
20
|
|
|
|
|
|
small_memcpy(root.data(), lemma.data(), lemma_len); root_len = lemma_len; |
4385
|
40
|
100
|
|
|
|
|
for (unsigned i = 0; i < lemma_roots; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4386
|
|
|
|
|
|
|
enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 }; |
4387
|
20
|
50
|
|
|
|
|
int operations = data.next_1B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4388
|
48
|
100
|
|
|
|
|
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4389
|
20
|
100
|
|
|
|
|
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4390
|
20
|
100
|
|
|
|
|
if (operations & ADD_START) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4391
|
44
|
50
|
|
|
|
|
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4392
|
14
|
100
|
|
|
|
|
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4393
|
|
|
|
|
|
|
} |
4394
|
20
|
100
|
|
|
|
|
if (operations & ADD_END) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4395
|
34
|
50
|
|
|
|
|
for (int len = data.next_1B(); len > 0; len--) |
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4396
|
22
|
50
|
|
|
|
|
root[root_len++] = data.next_1B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4397
|
20
|
50
|
|
|
|
|
uint16_t clas = data.next_2B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4398
|
|
|
|
|
|
|
|
4399
|
20
|
100
|
|
|
|
|
if (pass == 1) { // for each root |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4400
|
10
|
|
|
|
|
|
roots.add(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t)); |
4401
|
|
|
|
|
|
|
} else /*if (pass == 2)*/ { |
4402
|
10
|
|
|
|
|
|
unsigned char* root_data = roots.fill(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t)); |
4403
|
20
|
|
|
|
|
|
unsigned root_offset = root_data - root_len - roots.data_start(root_len); |
4404
|
|
|
|
|
|
|
|
4405
|
|
|
|
|
|
|
unaligned_store_inc(root_data, clas); |
4406
|
|
|
|
|
|
|
unaligned_store_inc(root_data, lemma_offset); |
4407
|
|
|
|
|
|
|
unaligned_store_inc(root_data, lemma_len); |
4408
|
10
|
50
|
|
|
|
|
assert(uint8_t(lemma_len) == lemma_len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4409
|
|
|
|
|
|
|
|
4410
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, root_offset); |
4411
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, root_len); |
4412
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, clas); |
4413
|
10
|
50
|
|
|
|
|
assert(uint8_t(root_len) == root_len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4414
|
|
|
|
|
|
|
} |
4415
|
|
|
|
|
|
|
} |
4416
|
|
|
|
|
|
|
} |
4417
|
|
|
|
|
|
|
|
4418
|
2
|
100
|
|
|
|
|
if (pass == 1) { // after the whole pass |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4419
|
1
|
50
|
|
|
|
|
lemmas.done_adding(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4420
|
1
|
50
|
|
|
|
|
roots.done_adding(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4421
|
|
|
|
|
|
|
} else /*if (pass == 2)*/ { |
4422
|
1
|
|
|
|
|
|
lemmas.done_filling(); |
4423
|
1
|
|
|
|
|
|
roots.done_filling(); |
4424
|
|
|
|
|
|
|
} |
4425
|
|
|
|
|
|
|
} |
4426
|
|
|
|
|
|
|
|
4427
|
|
|
|
|
|
|
// Load tags |
4428
|
1
|
50
|
|
|
|
|
tags.resize(data.next_2B()); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4429
|
7
|
100
|
|
|
|
|
for (auto&& tag : tags) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4430
|
6
|
50
|
|
|
|
|
tag.resize(data.next_1B()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4431
|
403
|
100
|
|
|
|
|
for (unsigned i = 0; i < tag.size(); i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4432
|
397
|
50
|
|
|
|
|
tag[i] = data.next_1B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4433
|
|
|
|
|
|
|
} |
4434
|
|
|
|
|
|
|
|
4435
|
|
|
|
|
|
|
// Load suffixes |
4436
|
1
|
50
|
|
|
|
|
suffixes.load(data); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4437
|
|
|
|
|
|
|
|
4438
|
|
|
|
|
|
|
// Fill classes from suffixes |
4439
|
2
|
50
|
|
|
|
|
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4440
|
|
|
|
|
|
|
unsigned classes_len = data.next_2B(); |
4441
|
|
|
|
|
|
|
const uint16_t* classes_ptr = data.next(classes_len); |
4442
|
1
|
|
|
|
|
|
const uint16_t* indices_ptr = data.next(classes_len + 1); |
4443
|
1
|
|
|
|
|
|
uint32_t tags_len = unaligned_load(indices_ptr); |
4444
|
7
|
100
|
|
|
|
|
for (unsigned i = 0; i < classes_len; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4445
|
6
|
|
|
|
|
|
tags_len += uint16_t(unaligned_load(indices_ptr + i + 1) - unaligned_load(indices_ptr + i)); |
4446
|
|
|
|
|
|
|
const uint16_t* tags_ptr = data.next(tags_len); |
4447
|
|
|
|
|
|
|
|
4448
|
1
|
|
|
|
|
|
string suffix_str(suffix, len); |
4449
|
1
|
|
|
|
|
|
uint32_t index = unaligned_load(indices_ptr), prev_index = 0; |
4450
|
7
|
100
|
|
|
|
|
for (unsigned i = 0; i < classes_len; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4451
|
6
|
|
|
|
|
|
auto classes_ptr_i = unaligned_load(classes_ptr + i); |
4452
|
6
|
50
|
|
|
|
|
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4453
|
|
|
|
|
|
|
prev_index = index; |
4454
|
6
|
|
|
|
|
|
index += uint16_t(unaligned_load(indices_ptr + i + 1) - unaligned_load(indices_ptr + i)); |
4455
|
6
|
50
|
|
|
|
|
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4456
|
12
|
100
|
|
|
|
|
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4457
|
6
|
50
|
|
|
|
|
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4458
|
|
|
|
|
|
|
} |
4459
|
1
|
|
|
|
|
|
}); |
4460
|
1
|
|
|
|
|
|
} |
4461
|
|
|
|
|
|
|
|
4462
|
|
|
|
|
|
|
template |
4463
|
8
|
|
|
|
|
|
void morpho_dictionary::analyze(string_piece form, vector& lemmas) const { |
4464
|
|
|
|
|
|
|
int max_suffix_len = suffixes.max_length(); |
4465
|
|
|
|
|
|
|
|
4466
|
|
|
|
|
|
|
uint16_t* suff_stack[16]; vector suff_heap; |
4467
|
8
|
50
|
|
|
|
|
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4468
|
|
|
|
|
|
|
int suff_len = 0; |
4469
|
16
|
100
|
|
|
|
|
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4470
|
8
|
|
|
|
|
|
suff[suff_len] = (uint16_t*) suffixes.at(form.str + i, suff_len, [](pointer_decoder& data) { |
4471
|
0
|
|
|
|
|
|
data.next(2 * data.next_2B()); |
4472
|
|
|
|
|
|
|
data.next(data.next_2B()); |
4473
|
0
|
|
|
|
|
|
}); |
4474
|
8
|
|
|
|
|
|
if (!suff[suff_len]) break; |
4475
|
|
|
|
|
|
|
} |
4476
|
|
|
|
|
|
|
|
4477
|
16
|
100
|
|
|
|
|
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4478
|
8
|
50
|
|
|
|
|
if (unaligned_load(suff[suff_len])) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4479
|
8
|
|
|
|
|
|
unsigned suff_classes = unaligned_load(suff[suff_len]); |
4480
|
8
|
|
|
|
|
|
uint16_t* suff_data = suff[suff_len] + 1; |
4481
|
|
|
|
|
|
|
|
4482
|
21
|
50
|
|
|
|
|
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4483
|
|
|
|
|
|
|
uint16_t root_class = root_data.next_2B(); |
4484
|
|
|
|
|
|
|
unsigned lemma_offset = root_data.next_4B(); |
4485
|
|
|
|
|
|
|
unsigned lemma_len = root_data.next_1B(); |
4486
|
|
|
|
|
|
|
|
4487
|
26
|
100
|
|
|
|
|
if (small_memeq(form.str, root, root_len)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4488
|
19
|
|
|
|
|
|
uint16_t* suffix_class_ptr = unaligned_lower_bound(suff_data, suff_classes, root_class); |
4489
|
10
|
50
|
|
|
|
|
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4490
|
30
|
|
|
|
|
|
const unsigned char* lemma_data = this->lemmas.data_start(lemma_len) + lemma_offset; |
4491
|
|
|
|
|
|
|
string lemma((const char*)lemma_data, lemma_len); |
4492
|
10
|
50
|
|
|
|
|
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4493
|
|
|
|
|
|
|
|
4494
|
20
|
|
|
|
|
|
uint16_t* suff_tag_indices = suff_data + suff_classes; |
4495
|
10
|
|
|
|
|
|
uint16_t* suff_tags = suff_tag_indices + suff_classes + 1; |
4496
|
20
|
100
|
|
|
|
|
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4497
|
20
|
|
|
|
|
|
i < unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data) + 1); i++) |
4498
|
10
|
50
|
|
|
|
|
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4499
|
|
|
|
|
|
|
} |
4500
|
|
|
|
|
|
|
} |
4501
|
13
|
|
|
|
|
|
}); |
4502
|
|
|
|
|
|
|
} |
4503
|
8
|
|
|
|
|
|
} |
4504
|
|
|
|
|
|
|
|
4505
|
|
|
|
|
|
|
template |
4506
|
0
|
|
|
|
|
|
bool morpho_dictionary::generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms) const { |
4507
|
|
|
|
|
|
|
LemmaAddinfo addinfo; |
4508
|
0
|
0
|
|
|
|
|
int raw_lemma_len = addinfo.parse(lemma); |
|
|
0
|
|
|
|
|
|
4509
|
0
|
|
|
|
|
|
bool matched_lemma = false; |
4510
|
|
|
|
|
|
|
|
4511
|
0
|
0
|
|
|
|
|
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4512
|
|
|
|
|
|
|
unsigned lemma_info_len = data.next_1B(); |
4513
|
|
|
|
|
|
|
const auto* lemma_info = data.next(lemma_info_len); |
4514
|
|
|
|
|
|
|
unsigned lemma_roots_len = data.next_1B(); |
4515
|
0
|
|
|
|
|
|
auto* lemma_roots_ptr = data.next(lemma_roots_len * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t))); |
4516
|
|
|
|
|
|
|
|
4517
|
0
|
0
|
|
|
|
|
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4518
|
0
|
|
|
|
|
|
matched_lemma = true; |
4519
|
|
|
|
|
|
|
|
4520
|
|
|
|
|
|
|
vector* forms = nullptr; |
4521
|
|
|
|
|
|
|
pointer_decoder lemma_roots(lemma_roots_ptr); |
4522
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4523
|
|
|
|
|
|
|
unsigned root_offset = lemma_roots.next_4B(); |
4524
|
|
|
|
|
|
|
unsigned root_len = lemma_roots.next_1B(); |
4525
|
|
|
|
|
|
|
unsigned clas = lemma_roots.next_2B(); |
4526
|
|
|
|
|
|
|
|
4527
|
0
|
|
|
|
|
|
const unsigned char* root_data = roots.data_start(root_len) + root_offset; |
4528
|
0
|
0
|
|
|
|
|
for (auto&& suffix : classes[clas]) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4529
|
|
|
|
|
|
|
string root_with_suffix; |
4530
|
0
|
0
|
|
|
|
|
for (auto&& tag : suffix.second) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4531
|
0
|
0
|
|
|
|
|
if (filter.matches(tags[tag].c_str())) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4532
|
0
|
0
|
|
|
|
|
if (!forms) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4533
|
0
|
0
|
|
|
|
|
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4534
|
0
|
|
|
|
|
|
forms = &lemmas_forms.back().forms; |
4535
|
|
|
|
|
|
|
} |
4536
|
|
|
|
|
|
|
|
4537
|
0
|
0
|
|
|
|
|
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4538
|
0
|
0
|
|
|
|
|
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4539
|
|
|
|
|
|
|
root_with_suffix.assign((const char*)root_data, root_len); |
4540
|
|
|
|
|
|
|
root_with_suffix.append(suffix.first); |
4541
|
|
|
|
|
|
|
} |
4542
|
|
|
|
|
|
|
|
4543
|
0
|
0
|
|
|
|
|
forms->emplace_back(root_with_suffix, tags[tag]); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4544
|
|
|
|
|
|
|
} |
4545
|
|
|
|
|
|
|
} |
4546
|
|
|
|
|
|
|
} |
4547
|
|
|
|
|
|
|
} |
4548
|
0
|
|
|
|
|
|
}); |
4549
|
|
|
|
|
|
|
|
4550
|
0
|
|
|
|
|
|
return matched_lemma; |
4551
|
|
|
|
|
|
|
} |
4552
|
|
|
|
|
|
|
|
4553
|
|
|
|
|
|
|
} // namespace morphodita |
4554
|
|
|
|
|
|
|
|
4555
|
|
|
|
|
|
|
///////// |
4556
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_prefix_guesser.h |
4557
|
|
|
|
|
|
|
///////// |
4558
|
|
|
|
|
|
|
|
4559
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4560
|
|
|
|
|
|
|
// |
4561
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4562
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4563
|
|
|
|
|
|
|
// |
4564
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4565
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4566
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4567
|
|
|
|
|
|
|
|
4568
|
|
|
|
|
|
|
namespace morphodita { |
4569
|
|
|
|
|
|
|
|
4570
|
|
|
|
|
|
|
// Declarations |
4571
|
|
|
|
|
|
|
template |
4572
|
0
|
|
|
|
|
|
class morpho_prefix_guesser { |
4573
|
|
|
|
|
|
|
public: |
4574
|
0
|
|
|
|
|
|
morpho_prefix_guesser(const MorphoDictionary& dictionary) : dictionary(dictionary) {} |
4575
|
|
|
|
|
|
|
|
4576
|
|
|
|
|
|
|
void load(binary_decoder& data); |
4577
|
|
|
|
|
|
|
void analyze(string_piece form, vector& lemmas); |
4578
|
|
|
|
|
|
|
bool generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms); |
4579
|
|
|
|
|
|
|
|
4580
|
|
|
|
|
|
|
private: |
4581
|
|
|
|
|
|
|
const MorphoDictionary& dictionary; |
4582
|
|
|
|
|
|
|
vector tag_filters; |
4583
|
|
|
|
|
|
|
persistent_unordered_map prefixes_initial, prefixes_middle; |
4584
|
|
|
|
|
|
|
}; |
4585
|
|
|
|
|
|
|
|
4586
|
|
|
|
|
|
|
// Definitions |
4587
|
|
|
|
|
|
|
template |
4588
|
0
|
|
|
|
|
|
void morpho_prefix_guesser::load(binary_decoder& data) { |
4589
|
|
|
|
|
|
|
// Load and construct tag filters |
4590
|
0
|
0
|
|
|
|
|
for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) { |
4591
|
0
|
|
|
|
|
|
unsigned tag_filter_len = data.next_1B(); |
4592
|
0
|
|
|
|
|
|
string tag_filter(data.next(tag_filter_len), tag_filter_len); |
4593
|
|
|
|
|
|
|
|
4594
|
0
|
0
|
|
|
|
|
tag_filters.emplace_back(tag_filter.c_str()); |
4595
|
|
|
|
|
|
|
} |
4596
|
|
|
|
|
|
|
|
4597
|
|
|
|
|
|
|
// Load prefixes |
4598
|
0
|
|
|
|
|
|
prefixes_initial.load(data); |
4599
|
0
|
|
|
|
|
|
prefixes_middle.load(data); |
4600
|
0
|
|
|
|
|
|
} |
4601
|
|
|
|
|
|
|
|
4602
|
|
|
|
|
|
|
// Analyze can return non-unique lemma-tag pairs. |
4603
|
|
|
|
|
|
|
template |
4604
|
0
|
|
|
|
|
|
void morpho_prefix_guesser::analyze(string_piece form, vector& lemmas) { |
4605
|
0
|
0
|
|
|
|
|
if (!form.len) return; |
4606
|
|
|
|
|
|
|
|
4607
|
|
|
|
|
|
|
vector form_tmp; |
4608
|
|
|
|
|
|
|
vector middle_masks; |
4609
|
0
|
0
|
|
|
|
|
middle_masks.reserve(form.len); |
4610
|
|
|
|
|
|
|
|
4611
|
0
|
0
|
|
|
|
|
for (unsigned initial = 0; initial < form.len; initial++) { |
4612
|
|
|
|
|
|
|
// Match the initial prefix. |
4613
|
0
|
|
|
|
|
|
unsigned initial_mask = (1<
|
4614
|
0
|
0
|
|
|
|
|
if (initial) { |
4615
|
0
|
|
|
|
|
|
auto found = prefixes_initial.at_typed(form.str, initial); |
4616
|
0
|
0
|
|
|
|
|
if (!found) break; |
4617
|
0
|
|
|
|
|
|
initial_mask = unaligned_load(found); |
4618
|
|
|
|
|
|
|
} |
4619
|
|
|
|
|
|
|
|
4620
|
|
|
|
|
|
|
// If we have found an initial prefix (including the empty one), match middle prefixes. |
4621
|
0
|
0
|
|
|
|
|
if (initial_mask) { |
4622
|
0
|
0
|
|
|
|
|
middle_masks.resize(initial); |
4623
|
0
|
0
|
|
|
|
|
middle_masks.emplace_back(initial_mask); |
4624
|
0
|
0
|
|
|
|
|
for (unsigned middle = initial; middle < middle_masks.size(); middle++) { |
4625
|
0
|
0
|
|
|
|
|
if (!middle_masks[middle]) continue; |
4626
|
|
|
|
|
|
|
// Try matching middle prefixes from current index. |
4627
|
0
|
0
|
|
|
|
|
for (unsigned i = middle + 1; i < form.len; i++) { |
4628
|
0
|
|
|
|
|
|
auto found = prefixes_middle.at_typed(form.str + middle, i - middle); |
4629
|
0
|
0
|
|
|
|
|
if (!found) break; |
4630
|
0
|
0
|
|
|
|
|
if (unaligned_load(found)) { |
4631
|
0
|
0
|
|
|
|
|
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
|
|
0
|
|
|
|
|
|
4632
|
0
|
|
|
|
|
|
middle_masks[i] |= middle_masks[middle] & unaligned_load(found); |
4633
|
|
|
|
|
|
|
} |
4634
|
|
|
|
|
|
|
} |
4635
|
|
|
|
|
|
|
|
4636
|
|
|
|
|
|
|
// Try matching word forms if at least one middle prefix was found. |
4637
|
0
|
0
|
|
|
|
|
if (middle > initial && middle < form.len ) { |
|
|
0
|
|
|
|
|
|
4638
|
0
|
0
|
|
|
|
|
if (initial) { |
4639
|
0
|
0
|
|
|
|
|
if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len); |
4640
|
0
|
|
|
|
|
|
small_memcpy(form_tmp.data() + middle - initial, form.str, initial); |
4641
|
|
|
|
|
|
|
} |
4642
|
0
|
|
|
|
|
|
unsigned lemmas_ori_size = lemmas.size(); |
4643
|
0
|
0
|
|
|
|
|
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
|
|
0
|
|
|
|
|
|
4644
|
|
|
|
|
|
|
unsigned lemmas_new_size = lemmas_ori_size; |
4645
|
0
|
0
|
|
|
|
|
for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) { |
4646
|
0
|
0
|
|
|
|
|
for (unsigned filter = 0; filter < tag_filters.size(); filter++) |
4647
|
0
|
0
|
|
|
|
|
if ((middle_masks[middle] & (1<
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4648
|
0
|
0
|
|
|
|
|
if (i == lemmas_new_size) { |
4649
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].lemma.insert(0, form.str + initial, middle - initial); |
4650
|
|
|
|
|
|
|
} else { |
4651
|
0
|
0
|
|
|
|
|
lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial); |
4652
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].lemma.assign(form.str + initial, middle - initial); |
4653
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].lemma.append(lemmas[i].lemma); |
4654
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].tag = lemmas[i].tag; |
4655
|
|
|
|
|
|
|
} |
4656
|
0
|
|
|
|
|
|
lemmas_new_size++; |
4657
|
0
|
|
|
|
|
|
break; |
4658
|
|
|
|
|
|
|
} |
4659
|
|
|
|
|
|
|
} |
4660
|
0
|
0
|
|
|
|
|
if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end()); |
4661
|
|
|
|
|
|
|
} |
4662
|
|
|
|
|
|
|
} |
4663
|
|
|
|
|
|
|
} |
4664
|
|
|
|
|
|
|
} |
4665
|
|
|
|
|
|
|
} |
4666
|
|
|
|
|
|
|
|
4667
|
|
|
|
|
|
|
template |
4668
|
|
|
|
|
|
|
bool morpho_prefix_guesser::generate(string_piece /*lemma*/, const tag_filter& /*filter*/, vector& /*lemmas_forms*/) { |
4669
|
|
|
|
|
|
|
// Not implemented yet. Is it actually needed? |
4670
|
|
|
|
|
|
|
return false; |
4671
|
|
|
|
|
|
|
} |
4672
|
|
|
|
|
|
|
} // namespace morphodita |
4673
|
|
|
|
|
|
|
|
4674
|
|
|
|
|
|
|
///////// |
4675
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser.h |
4676
|
|
|
|
|
|
|
///////// |
4677
|
|
|
|
|
|
|
|
4678
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4679
|
|
|
|
|
|
|
// |
4680
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4681
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4682
|
|
|
|
|
|
|
// |
4683
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4684
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4685
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4686
|
|
|
|
|
|
|
|
4687
|
|
|
|
|
|
|
namespace morphodita { |
4688
|
|
|
|
|
|
|
|
4689
|
1
|
|
|
|
|
|
class morpho_statistical_guesser { |
4690
|
|
|
|
|
|
|
public: |
4691
|
|
|
|
|
|
|
void load(binary_decoder& data); |
4692
|
|
|
|
|
|
|
typedef vector used_rules; |
4693
|
|
|
|
|
|
|
void analyze(string_piece form, vector& lemmas, used_rules* used); |
4694
|
|
|
|
|
|
|
|
4695
|
|
|
|
|
|
|
private: |
4696
|
|
|
|
|
|
|
vector tags; |
4697
|
|
|
|
|
|
|
unsigned default_tag; |
4698
|
|
|
|
|
|
|
persistent_unordered_map rules; |
4699
|
|
|
|
|
|
|
}; |
4700
|
|
|
|
|
|
|
|
4701
|
|
|
|
|
|
|
} // namespace morphodita |
4702
|
|
|
|
|
|
|
|
4703
|
|
|
|
|
|
|
///////// |
4704
|
|
|
|
|
|
|
// File: morphodita/tokenizer/unicode_tokenizer.h |
4705
|
|
|
|
|
|
|
///////// |
4706
|
|
|
|
|
|
|
|
4707
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4708
|
|
|
|
|
|
|
// |
4709
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4710
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4711
|
|
|
|
|
|
|
// |
4712
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4713
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4714
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4715
|
|
|
|
|
|
|
|
4716
|
|
|
|
|
|
|
namespace morphodita { |
4717
|
|
|
|
|
|
|
|
4718
|
1
|
|
|
|
|
|
class unicode_tokenizer : public tokenizer { |
4719
|
|
|
|
|
|
|
public: |
4720
|
|
|
|
|
|
|
enum { URL_EMAIL_LATEST = 2 }; |
4721
|
|
|
|
|
|
|
unicode_tokenizer(unsigned url_email_tokenizer); |
4722
|
|
|
|
|
|
|
|
4723
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
4724
|
|
|
|
|
|
|
virtual bool next_sentence(vector* forms, vector* tokens) override; |
4725
|
|
|
|
|
|
|
|
4726
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) = 0; |
4727
|
|
|
|
|
|
|
|
4728
|
|
|
|
|
|
|
protected: |
4729
|
|
|
|
|
|
|
struct char_info { |
4730
|
|
|
|
|
|
|
char32_t chr; |
4731
|
|
|
|
|
|
|
unilib::unicode::category_t cat; |
4732
|
|
|
|
|
|
|
const char* str; |
4733
|
|
|
|
|
|
|
|
4734
|
36
|
|
|
|
|
|
char_info(char32_t chr, const char* str) : chr(chr), cat(unilib::unicode::category(chr)), str(str) {} |
4735
|
|
|
|
|
|
|
}; |
4736
|
|
|
|
|
|
|
vector chars; |
4737
|
|
|
|
|
|
|
size_t current; |
4738
|
|
|
|
|
|
|
|
4739
|
|
|
|
|
|
|
bool tokenize_url_email(vector& tokens); |
4740
|
|
|
|
|
|
|
bool emergency_sentence_split(const vector& tokens); |
4741
|
|
|
|
|
|
|
bool is_eos(const vector& tokens, char32_t eos_chr, const unordered_set* abbreviations); |
4742
|
|
|
|
|
|
|
|
4743
|
|
|
|
|
|
|
private: |
4744
|
|
|
|
|
|
|
unsigned url_email_tokenizer; |
4745
|
|
|
|
|
|
|
string text_buffer; |
4746
|
|
|
|
|
|
|
vector tokens_buffer; |
4747
|
|
|
|
|
|
|
string eos_buffer; |
4748
|
|
|
|
|
|
|
}; |
4749
|
|
|
|
|
|
|
|
4750
|
|
|
|
|
|
|
} // namespace morphodita |
4751
|
|
|
|
|
|
|
|
4752
|
|
|
|
|
|
|
///////// |
4753
|
|
|
|
|
|
|
// File: morphodita/tokenizer/ragel_tokenizer.h |
4754
|
|
|
|
|
|
|
///////// |
4755
|
|
|
|
|
|
|
|
4756
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4757
|
|
|
|
|
|
|
// |
4758
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4759
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4760
|
|
|
|
|
|
|
// |
4761
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4762
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4763
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4764
|
|
|
|
|
|
|
|
4765
|
|
|
|
|
|
|
namespace morphodita { |
4766
|
|
|
|
|
|
|
|
4767
|
0
|
|
|
|
|
|
class ragel_tokenizer : public unicode_tokenizer { |
4768
|
|
|
|
|
|
|
public: |
4769
|
|
|
|
|
|
|
ragel_tokenizer(unsigned url_email_tokenizer); |
4770
|
|
|
|
|
|
|
|
4771
|
|
|
|
|
|
|
protected: |
4772
|
|
|
|
|
|
|
static inline uint8_t ragel_char(const char_info& chr); |
4773
|
|
|
|
|
|
|
|
4774
|
|
|
|
|
|
|
private: |
4775
|
|
|
|
|
|
|
static void initialize_ragel_map(); |
4776
|
|
|
|
|
|
|
static vector ragel_map; |
4777
|
|
|
|
|
|
|
static atomic_flag ragel_map_flag; |
4778
|
|
|
|
|
|
|
static void ragel_map_add(char32_t chr, uint8_t mapping); |
4779
|
|
|
|
|
|
|
|
4780
|
|
|
|
|
|
|
friend class unicode_tokenizer; |
4781
|
|
|
|
|
|
|
static bool ragel_url_email(unsigned version, const vector& chars, size_t& current_char, vector& tokens); |
4782
|
|
|
|
|
|
|
}; |
4783
|
|
|
|
|
|
|
|
4784
|
|
|
|
|
|
|
uint8_t ragel_tokenizer::ragel_char(const char_info& chr) { |
4785
|
30
|
50
|
|
|
|
|
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4786
|
|
|
|
|
|
|
} |
4787
|
|
|
|
|
|
|
|
4788
|
|
|
|
|
|
|
} // namespace morphodita |
4789
|
|
|
|
|
|
|
|
4790
|
|
|
|
|
|
|
///////// |
4791
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer.h |
4792
|
|
|
|
|
|
|
///////// |
4793
|
|
|
|
|
|
|
|
4794
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4795
|
|
|
|
|
|
|
// |
4796
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4797
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4798
|
|
|
|
|
|
|
// |
4799
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4800
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4801
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4802
|
|
|
|
|
|
|
|
4803
|
|
|
|
|
|
|
namespace morphodita { |
4804
|
|
|
|
|
|
|
|
4805
|
0
|
|
|
|
|
|
class czech_tokenizer : public ragel_tokenizer { |
4806
|
|
|
|
|
|
|
public: |
4807
|
|
|
|
|
|
|
enum tokenizer_language { CZECH = 0, SLOVAK = 1 }; |
4808
|
|
|
|
|
|
|
enum { LATEST = 2 }; |
4809
|
|
|
|
|
|
|
czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m = nullptr); |
4810
|
|
|
|
|
|
|
|
4811
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
4812
|
|
|
|
|
|
|
|
4813
|
|
|
|
|
|
|
private: |
4814
|
|
|
|
|
|
|
const morpho* m; |
4815
|
|
|
|
|
|
|
const unordered_set* abbreviations; |
4816
|
|
|
|
|
|
|
vector lemmas; |
4817
|
|
|
|
|
|
|
|
4818
|
|
|
|
|
|
|
void merge_hyphenated(vector& tokens); |
4819
|
|
|
|
|
|
|
|
4820
|
|
|
|
|
|
|
static const unordered_set abbreviations_czech; |
4821
|
|
|
|
|
|
|
static const unordered_set abbreviations_slovak; |
4822
|
|
|
|
|
|
|
}; |
4823
|
|
|
|
|
|
|
|
4824
|
|
|
|
|
|
|
} // namespace morphodita |
4825
|
|
|
|
|
|
|
|
4826
|
|
|
|
|
|
|
///////// |
4827
|
|
|
|
|
|
|
// File: morphodita/morpho/czech_morpho.h |
4828
|
|
|
|
|
|
|
///////// |
4829
|
|
|
|
|
|
|
|
4830
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4831
|
|
|
|
|
|
|
// |
4832
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4833
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4834
|
|
|
|
|
|
|
// |
4835
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4836
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4837
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4838
|
|
|
|
|
|
|
|
4839
|
|
|
|
|
|
|
namespace morphodita { |
4840
|
|
|
|
|
|
|
|
4841
|
0
|
|
|
|
|
|
class czech_morpho : public morpho { |
4842
|
|
|
|
|
|
|
public: |
4843
|
|
|
|
|
|
|
using morpho_language = czech_tokenizer::tokenizer_language; |
4844
|
|
|
|
|
|
|
|
4845
|
0
|
0
|
|
|
|
|
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4846
|
|
|
|
|
|
|
|
4847
|
|
|
|
|
|
|
virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override; |
4848
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override; |
4849
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const override; |
4850
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const override; |
4851
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const override; |
4852
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const override; |
4853
|
|
|
|
|
|
|
|
4854
|
|
|
|
|
|
|
bool load(istream& is); |
4855
|
|
|
|
|
|
|
private: |
4856
|
|
|
|
|
|
|
inline void analyze_special(string_piece form, vector& lemmas) const; |
4857
|
|
|
|
|
|
|
|
4858
|
|
|
|
|
|
|
morpho_language language; |
4859
|
|
|
|
|
|
|
unsigned version; |
4860
|
|
|
|
|
|
|
morpho_dictionary dictionary; |
4861
|
|
|
|
|
|
|
unique_ptr> prefix_guesser; |
4862
|
|
|
|
|
|
|
unique_ptr statistical_guesser; |
4863
|
|
|
|
|
|
|
|
4864
|
|
|
|
|
|
|
string unknown_tag = "X@-------------"; |
4865
|
|
|
|
|
|
|
string number_tag = "C=-------------"; |
4866
|
|
|
|
|
|
|
string punctuation_tag = "Z:-------------"; |
4867
|
|
|
|
|
|
|
}; |
4868
|
|
|
|
|
|
|
|
4869
|
|
|
|
|
|
|
} // namespace morphodita |
4870
|
|
|
|
|
|
|
|
4871
|
|
|
|
|
|
|
///////// |
4872
|
|
|
|
|
|
|
// File: morphodita/morpho/czech_morpho.cpp |
4873
|
|
|
|
|
|
|
///////// |
4874
|
|
|
|
|
|
|
|
4875
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
4876
|
|
|
|
|
|
|
// |
4877
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
4878
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
4879
|
|
|
|
|
|
|
// |
4880
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
4881
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
4882
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
4883
|
|
|
|
|
|
|
|
4884
|
|
|
|
|
|
|
namespace morphodita { |
4885
|
|
|
|
|
|
|
|
4886
|
0
|
|
|
|
|
|
bool czech_morpho::load(istream& is) { |
4887
|
|
|
|
|
|
|
binary_decoder data; |
4888
|
0
|
0
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
0
|
|
|
|
|
|
4889
|
|
|
|
|
|
|
|
4890
|
|
|
|
|
|
|
try { |
4891
|
|
|
|
|
|
|
// Load tag length |
4892
|
0
|
0
|
|
|
|
|
unsigned tag_length = data.next_1B(); |
4893
|
0
|
0
|
|
|
|
|
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
|
|
0
|
|
|
|
|
|
4894
|
0
|
0
|
|
|
|
|
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
|
|
0
|
|
|
|
|
|
4895
|
0
|
0
|
|
|
|
|
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
|
|
0
|
|
|
|
|
|
4896
|
|
|
|
|
|
|
|
4897
|
|
|
|
|
|
|
// Load dictionary |
4898
|
0
|
0
|
|
|
|
|
dictionary.load(data); |
4899
|
|
|
|
|
|
|
|
4900
|
|
|
|
|
|
|
// Optionally prefix guesser if present |
4901
|
0
|
|
|
|
|
|
prefix_guesser.reset(); |
4902
|
0
|
0
|
|
|
|
|
if (data.next_1B()) { |
|
|
0
|
|
|
|
|
|
4903
|
0
|
0
|
|
|
|
|
prefix_guesser.reset(new morpho_prefix_guesser(dictionary)); |
4904
|
0
|
0
|
|
|
|
|
prefix_guesser->load(data); |
4905
|
|
|
|
|
|
|
} |
4906
|
|
|
|
|
|
|
|
4907
|
|
|
|
|
|
|
// Optionally statistical guesser if present |
4908
|
|
|
|
|
|
|
statistical_guesser.reset(); |
4909
|
0
|
0
|
|
|
|
|
if (data.next_1B()) { |
|
|
0
|
|
|
|
|
|
4910
|
0
|
0
|
|
|
|
|
statistical_guesser.reset(new morpho_statistical_guesser()); |
4911
|
0
|
0
|
|
|
|
|
statistical_guesser->load(data); |
4912
|
|
0
|
|
|
|
|
} |
4913
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
4914
|
|
|
|
|
|
|
return false; |
4915
|
|
|
|
|
|
|
} |
4916
|
|
|
|
|
|
|
|
4917
|
0
|
|
|
|
|
|
return data.is_end(); |
4918
|
|
|
|
|
|
|
} |
4919
|
|
|
|
|
|
|
|
4920
|
0
|
|
|
|
|
|
int czech_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const { |
4921
|
|
|
|
|
|
|
lemmas.clear(); |
4922
|
|
|
|
|
|
|
|
4923
|
0
|
0
|
|
|
|
|
if (form.len) { |
4924
|
|
|
|
|
|
|
// Generate all casing variants if needed (they are different than given form). |
4925
|
|
|
|
|
|
|
string form_uclc; // first uppercase, rest lowercase |
4926
|
|
|
|
|
|
|
string form_lc; // all lowercase |
4927
|
0
|
0
|
|
|
|
|
generate_casing_variants(form, form_uclc, form_lc); |
4928
|
|
|
|
|
|
|
|
4929
|
|
|
|
|
|
|
// Start by analysing using the dictionary and all casing variants. |
4930
|
0
|
0
|
|
|
|
|
dictionary.analyze(form, lemmas); |
4931
|
0
|
0
|
|
|
|
|
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
|
0
|
|
|
|
|
|
4932
|
0
|
0
|
|
|
|
|
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
|
0
|
|
|
|
|
|
4933
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
4934
|
|
|
|
|
|
|
|
4935
|
|
|
|
|
|
|
// Then call analyze_special to handle numbers and punctuation. |
4936
|
0
|
0
|
|
|
|
|
analyze_special(form, lemmas); |
4937
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
4938
|
|
|
|
|
|
|
|
4939
|
|
|
|
|
|
|
// For the prefix guesser, use only form_lc. |
4940
|
0
|
0
|
|
|
|
|
if (guesser == GUESSER && prefix_guesser) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4941
|
0
|
0
|
|
|
|
|
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
|
|
0
|
|
|
|
|
|
4942
|
|
|
|
|
|
|
bool prefix_guesser_guesses = !lemmas.empty(); |
4943
|
|
|
|
|
|
|
|
4944
|
|
|
|
|
|
|
// For the statistical guesser, use all casing variants. |
4945
|
0
|
0
|
|
|
|
|
if (guesser == GUESSER && statistical_guesser) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4946
|
0
|
0
|
|
|
|
|
if (form_uclc.empty() && form_lc.empty()) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
4947
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, nullptr); |
4948
|
|
|
|
|
|
|
else { |
4949
|
0
|
0
|
|
|
|
|
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
4950
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, &used_rules); |
4951
|
0
|
0
|
|
|
|
|
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
|
0
|
|
|
|
|
|
4952
|
0
|
0
|
|
|
|
|
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
|
0
|
|
|
|
|
|
4953
|
|
|
|
|
|
|
} |
4954
|
|
|
|
|
|
|
} |
4955
|
|
|
|
|
|
|
|
4956
|
|
|
|
|
|
|
// Make sure results are unique lemma-tag pairs. Statistical guesser produces |
4957
|
|
|
|
|
|
|
// unique lemma-tag pairs, but prefix guesser does not. |
4958
|
0
|
0
|
|
|
|
|
if (prefix_guesser_guesses) { |
4959
|
0
|
|
|
|
|
|
sort(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) { |
4960
|
0
|
|
|
|
|
|
int lemma_compare = a.lemma.compare(b.lemma); |
4961
|
0
|
0
|
|
|
|
|
return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); |
4962
|
|
|
|
|
|
|
}); |
4963
|
0
|
|
|
|
|
|
auto lemmas_end = unique(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) { |
4964
|
0
|
0
|
|
|
|
|
return a.lemma == b.lemma && a.tag == b.tag; |
|
|
0
|
|
|
|
|
|
4965
|
0
|
|
|
|
|
|
}); |
4966
|
0
|
0
|
|
|
|
|
if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end()); |
4967
|
|
|
|
|
|
|
} |
4968
|
|
|
|
|
|
|
|
4969
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return GUESSER; |
4970
|
|
|
|
|
|
|
} |
4971
|
|
|
|
|
|
|
|
4972
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
4973
|
0
|
|
|
|
|
|
return -1; |
4974
|
|
|
|
|
|
|
} |
4975
|
|
|
|
|
|
|
|
4976
|
0
|
|
|
|
|
|
// Generate all forms of `lemma` whose tags match `tag_wildcard`.
//
// `forms` is always cleared first. Returns NO_GUESSER when the dictionary
// produced forms, GUESSER when only the prefix guesser (if present and
// requested) produced them, and -1 when nothing was generated or the lemma
// is empty.
// NOTE(review): the element type of `vector` is missing in this listing
// (template arguments appear to have been stripped) - restore from the
// original source.
int czech_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode guesser, vector& forms) const {
  forms.clear();
  tag_filter filter(tag_wildcard);

  // Nothing can be generated for an empty lemma.
  if (!lemma.len) return -1;

  // The dictionary takes precedence over any guesser.
  if (dictionary.generate(lemma, filter, forms)) return NO_GUESSER;

  // Fall back to the prefix guesser only when guessing was requested and a
  // prefix guesser exists.
  const bool may_guess = guesser == GUESSER && prefix_guesser;
  if (may_guess && prefix_guesser->generate(lemma, filter, forms)) return GUESSER;

  return -1;
}
4992
|
|
|
|
|
|
|
|
4993
|
0
|
|
|
|
|
|
// Length of the raw lemma part of `lemma`; delegates to czech_lemma_addinfo.
int czech_morpho::raw_lemma_len(string_piece lemma) const {
  const auto raw_length = czech_lemma_addinfo::raw_lemma_len(lemma);
  return raw_length;
}
4996
|
|
|
|
|
|
|
|
4997
|
0
|
|
|
|
|
|
// Length of the lemma-id part of `lemma`; delegates to czech_lemma_addinfo.
int czech_morpho::lemma_id_len(string_piece lemma) const {
  const auto id_length = czech_lemma_addinfo::lemma_id_len(lemma);
  return id_length;
}
5000
|
|
|
|
|
|
|
|
5001
|
0
|
|
|
|
|
|
// The raw form is the whole form: simply report its length.
int czech_morpho::raw_form_len(string_piece form) const {
  const auto whole_length = form.len;
  return whole_length;
}
5004
|
|
|
|
|
|
|
|
5005
|
0
|
|
|
|
|
|
// Create a czech_tokenizer configured with this morphology's language and
// version and a back-pointer to this object. The tokenizer is allocated with
// `new` and handed to the caller as a raw pointer.
tokenizer* czech_morpho::new_tokenizer() const {
  tokenizer* created = new czech_tokenizer(language, version, this);
  return created;
}
5008
|
|
|
|
|
|
|
|
5009
|
|
|
|
|
|
|
// What characters are considered punctuation except for the ones in unicode Punctuation category.
//
// Lookup table indexed directly by code point: a nonzero entry marks a
// character that analyze_special treats as punctuation even though it is not
// in the Unicode P category. Each nonzero entry is labelled inline with the
// character it stands for. Callers must bounds-check the code point against
// sizeof(punctuation_additional) before indexing.
static bool punctuation_additional[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*$*/,
  0,0,0,0,0,0,1/*+*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*<*/,1/*=*/,1/*>*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,1/*^*/,0,1/*`*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*|*/,0,1/*~*/,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*caron*/};
5023
|
|
|
|
|
|
|
|
5024
|
|
|
|
|
|
|
// What characters of unicode Punctuation category are not considered punctuation.
//
// Lookup table indexed directly by code point: a nonzero entry marks a
// character that analyze_special must NOT tag as punctuation even though its
// Unicode category is P. Code points at or beyond
// sizeof(punctuation_exceptions) are treated by the caller as "no exception".
static bool punctuation_exceptions[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  0,0,0,0,0,0,0,0,0,1/*paragraph*/};
5029
|
|
|
|
|
|
|
|
5030
|
0
|
|
|
|
|
|
// Recognize numbers and punctuation that the dictionary does not cover.
//
// A number matches
//   [+-]? digit* ([.,] digit*)? ([Ee] [+-]? digit+)?
// with at least one digit, optionally followed by a single trailing '.',
// where "digit" is any code point of Unicode category N. Numbers take
// precedence: "-" is punctuation, "-3" a number, "-." punctuation, "-.3" a
// number. Otherwise the form is punctuation when its FIRST character is
// either listed in punctuation_additional, or is of Unicode category P and
// not listed in punctuation_exceptions.
//
// At most one lemma (the unchanged form) is appended to `lemmas`.
// NOTE(review): the element type of `vector` is missing in this listing
// (template arguments appear to have been stripped) - restore from the
// original source.
void czech_morpho::analyze_special(string_piece form, vector& lemmas) const {
  using namespace unilib;

  if (form.len == 0) return;

  // `form` is consumed while scanning; keep an untouched copy for the lemma.
  const string_piece whole_form = form;

  auto read_char = [&form]() { return utf8::decode(form.str, form.len); };
  auto is_digit = [](char32_t chr) { return (unicode::category(chr) & unicode::N) != 0; };
  auto is_sign = [](char32_t chr) { return chr == '+' || chr == '-'; };

  const char32_t first = read_char();

  // Try matching a number.
  char32_t current = first;
  bool seen_digit = false;
  auto skip_digits = [&]() {
    for (; is_digit(current); current = read_char()) seen_digit = true;
  };

  if (is_sign(current)) current = read_char();
  skip_digits();
  // A '.' is a decimal separator only when something follows it; otherwise
  // it is left in `current` as a possible trailing dot.
  if ((current == '.' && form.len) || current == ',') current = read_char();
  skip_digits();
  if (seen_digit && (current == 'e' || current == 'E')) {
    current = read_char();
    if (is_sign(current)) current = read_char();
    seen_digit = false;  // the exponent needs digits of its own
    skip_digits();
  }

  // The whole input must be consumed; the last decoded character is either
  // zero (presumably what the decoder yields at end of input) or one
  // trailing '.'.
  const bool is_number = seen_digit && !form.len && (!current || current == '.');
  if (is_number) {
    lemmas.emplace_back(string(whole_form.str, whole_form.len), number_tag);
    return;
  }

  const bool listed_extra = first < sizeof(punctuation_additional) && punctuation_additional[first];
  if (listed_extra ||
      ((unicode::category(first) & unicode::P) &&
       (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first])))
    lemmas.emplace_back(string(whole_form.str, whole_form.len), punctuation_tag);
}
5062
|
|
|
|
|
|
|
|
5063
|
|
|
|
|
|
|
} // namespace morphodita |
5064
|
|
|
|
|
|
|
|
5065
|
|
|
|
|
|
|
///////// |
5066
|
|
|
|
|
|
|
// File: morphodita/morpho/english_lemma_addinfo.h |
5067
|
|
|
|
|
|
|
///////// |
5068
|
|
|
|
|
|
|
|
5069
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
5070
|
|
|
|
|
|
|
// |
5071
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
5072
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
5073
|
|
|
|
|
|
|
// |
5074
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
5075
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
5076
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5077
|
|
|
|
|
|
|
|
5078
|
|
|
|
|
|
|
namespace morphodita { |
5079
|
|
|
|
|
|
|
|
5080
|
|
|
|
|
|
|
// Declarations |
5081
|
0
|
|
|
|
|
|
// Handling of the additional information attached to English lemmas: an
// optional suffix starting with '^' (optionally followed by a tag-like
// string) or a trailing '+'.
struct english_lemma_addinfo {
  // Length of the lemma without its '^...' / '+' suffix.
  inline static int raw_lemma_len(string_piece lemma);
  // Length of the lemma id; for English this is the whole lemma.
  inline static int lemma_id_len(string_piece lemma);
  // Render stored additional info back to text (returned verbatim).
  inline static string format(const unsigned char* addinfo, int addinfo_len);
  // Whether forms may be generated for a lemma with this addinfo (always true).
  inline static bool generatable(const unsigned char* addinfo, int addinfo_len);

  // Split `lemma` into raw lemma and suffix; the suffix is stored in `data`
  // and the raw lemma length is returned. `die_on_failure` is unused.
  inline int parse(string_piece lemma, bool die_on_failure = false);
  // Check whether the suffix stored in `data` is compatible with
  // `other_addinfo`.
  inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len);

  // Suffix bytes captured by parse().
  // NOTE(review): the element type of `vector` is missing in this listing
  // (template arguments appear to have been stripped) - restore from the
  // original source.
  vector data;
};
5092
|
|
|
|
|
|
|
|
5093
|
|
|
|
|
|
|
// Definitions |
5094
|
0
|
|
|
|
|
|
// Return the length of the lemma proper, i.e. without its suffix.
//
// The lemma ends at the first non-initial position holding either
//  - '^' followed by nothing or by [A-Za-z][-A-Za-z]* up to the end, or
//  - '+' as the very last character.
// When no such position exists the whole lemma length is returned.
int english_lemma_addinfo::raw_lemma_len(string_piece lemma) {
  for (unsigned pos = 1; pos < lemma.len; pos++) {
    const char mark = lemma.str[pos];

    // Last character: a bare '^' or '+' terminates the lemma.
    if (pos + 1 == lemma.len) {
      if (mark == '^' || mark == '+') return pos;
      continue;
    }

    // Inner character: only '^' followed by a well-formed tail counts.
    if (mark != '^') continue;

    unsigned i = pos + 1;
    for (; i < lemma.len; i++) {
      const char c = lemma.str[i];
      const bool letter = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
      // A hyphen is allowed anywhere except directly after the '^'.
      const bool inner_hyphen = i > pos + 1 && c == '-';
      if (!letter && !inner_hyphen) break;
    }
    if (i == lemma.len) return pos;
  }
  return lemma.len;
}
5112
|
|
|
|
|
|
|
|
5113
|
|
|
|
|
|
|
// English lemmas carry no lemma comments, so the lemma id spans the whole
// lemma.
int english_lemma_addinfo::lemma_id_len(string_piece lemma) {
  const auto whole_length = lemma.len;
  return whole_length;
}
5117
|
|
|
|
|
|
|
|
5118
|
|
|
|
|
|
|
// The stored additional info is already textual: return its
// `addinfo_len` bytes as a string.
string english_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) {
  const char* text = (const char*) addinfo;
  return string(text, addinfo_len);
}
5121
|
|
|
|
|
|
|
|
5122
|
|
|
|
|
|
|
// Every English lemma is reported as generatable; both arguments are ignored.
bool english_lemma_addinfo::generatable(const unsigned char* /*addinfo*/, int /*addinfo_len*/) {
  return true;
}
5125
|
|
|
|
|
|
|
|
5126
|
0
|
|
|
|
|
|
// Split `lemma` into the raw lemma and its suffix.
//
// Everything after the raw lemma (as delimited by raw_lemma_len) replaces
// the contents of `data`. Returns the raw lemma length. The second
// parameter is unused.
int english_lemma_addinfo::parse(string_piece lemma, bool /*die_on_failure*/) {
  const size_t raw_length = raw_lemma_len(lemma);

  data.clear();
  for (const char* chr = lemma.str + raw_length; chr < lemma.str + lemma.len; ++chr)
    data.push_back(*chr);

  return raw_length;
}
5135
|
|
|
|
|
|
|
|
5136
|
0
|
|
|
|
|
|
bool english_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) { |
5137
|
0
|
0
|
|
|
|
|
if (data.empty()) return true; |
5138
|
0
|
0
|
|
|
|
|
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
5139
|
0
|
0
|
|
|
|
|
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
5140
|
0
|
0
|
|
|
|
|
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
|
|
0
|
|
|
|
|
|
5141
|
|
|
|
|
|
|
} |
5142
|
|
|
|
|
|
|
|
5143
|
|
|
|
|
|
|
} // namespace morphodita |
5144
|
|
|
|
|
|
|
|
5145
|
|
|
|
|
|
|
///////// |
5146
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho_guesser.h |
5147
|
|
|
|
|
|
|
///////// |
5148
|
|
|
|
|
|
|
|
5149
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
5150
|
|
|
|
|
|
|
// |
5151
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
5152
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
5153
|
|
|
|
|
|
|
// |
5154
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
5155
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
5156
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5157
|
|
|
|
|
|
|
|
5158
|
|
|
|
|
|
|
namespace morphodita { |
5159
|
|
|
|
|
|
|
|
5160
|
0
|
0
|
|
|
|
|
// Rule-based guesser for English word forms that the dictionary does not
// know. Only load() is defined in the visible part of this file; the notes
// on the other members are based on how english_morpho calls them.
// NOTE(review): every `vector` / `persistent_unordered_map` below has lost
// its template arguments in this listing - restore from the original source.
class english_morpho_guesser {
 public:
  // Read the exception tags, the exceptions map and the negations map.
  void load(binary_decoder& data);
  // Guess analyses of `form` (with its lowercased variant `form_lc`) and
  // append them to `lemmas`.
  void analyze(string_piece form, string_piece form_lc, vector& lemmas) const;
  // Called by english_morpho::analyze after a dictionary hit; a true result
  // makes the caller report GUESSER instead of NO_GUESSER.
  bool analyze_proper_names(string_piece form, string_piece form_lc, vector& lemmas) const;

 private:
  // Helpers appending analyses for one or two tags, optionally with a
  // negation prefix length (definitions are outside this view).
  inline void add(const string& tag, const string& form, vector& lemmas) const;
  inline void add(const string& tag, const string& tag2, const string& form, vector& lemmas) const;
  inline void add(const string& tag, const string& form, unsigned negation_len, vector& lemmas) const;
  inline void add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector& lemmas) const;
  // Tag-specific helpers named after the tag(s) they produce.
  void add_NNS(const string& form, unsigned negation_len, vector& lemmas) const;
  void add_NNPS(const string& form, vector& lemmas) const;
  void add_VBG(const string& form, vector& lemmas) const;
  void add_VBD_VBN(const string& form, vector& lemmas) const;
  void add_VBZ(const string& form, vector& lemmas) const;
  void add_JJR_RBR(const string& form, unsigned negation_len, vector& lemmas) const;
  void add_JJS_RBS(const string& form, unsigned negation_len, vector& lemmas) const;

  // NOTE(review): presumably field offsets within a negations record - confirm.
  enum { NEGATION_LEN = 0, TO_FOLLOW = 1, TOTAL = 2 };
  // Tag strings read by load(), one per entry announced in the model.
  vector exceptions_tags;
  persistent_unordered_map exceptions;
  persistent_unordered_map negations;
  // Tag names used by the guesser (presumably Penn Treebank tags).
  string CD = "CD", FW = "FW", JJ = "JJ", JJR = "JJR", JJS = "JJS",
         NN = "NN", NNP = "NNP", NNPS = "NNPS", NNS = "NNS", RB = "RB",
         RBR = "RBR", RBS = "RBS", SYM = "SYM", VB = "VB", VBD = "VBD",
         VBG = "VBG", VBN = "VBN", VBP = "VBP", VBZ = "VBZ";
};
5188
|
|
|
|
|
|
|
|
5189
|
|
|
|
|
|
|
} // namespace morphodita |
5190
|
|
|
|
|
|
|
|
5191
|
|
|
|
|
|
|
///////// |
5192
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho.h |
5193
|
|
|
|
|
|
|
///////// |
5194
|
|
|
|
|
|
|
|
5195
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
5196
|
|
|
|
|
|
|
// |
5197
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
5198
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
5199
|
|
|
|
|
|
|
// |
5200
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
5201
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
5202
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5203
|
|
|
|
|
|
|
|
5204
|
|
|
|
|
|
|
namespace morphodita { |
5205
|
|
|
|
|
|
|
|
5206
|
0
|
|
|
|
|
|
// English morphology: dictionary lookup, special handling of numbers,
// punctuation and symbols, and a rule-based guesser for unknown words.
// NOTE(review): `vector` parameters and `morpho_dictionary` have lost their
// template arguments in this listing - restore from the original source.
class english_morpho : public morpho {
 public:
  // `version` affects number analysis (trailing "s" handling needs
  // version >= 2) and which tokenizer version new_tokenizer() creates.
  english_morpho(unsigned version) : version(version) {}

  // Analyse `form`; returns NO_GUESSER, GUESSER or -1 (unknown form).
  virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override;
  // Generate forms of `lemma` from the dictionary; returns NO_GUESSER or -1.
  virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override;
  virtual int raw_lemma_len(string_piece lemma) const override;
  virtual int lemma_id_len(string_piece lemma) const override;
  virtual int raw_form_len(string_piece form) const override;
  // Returns a newly allocated english_tokenizer.
  virtual tokenizer* new_tokenizer() const override;

  // Load the dictionary and the guesser from a compressed stream; false on
  // any decoding error or trailing data.
  bool load(istream& is);
 private:
  // Tag numbers, punctuation and symbols not covered by the dictionary.
  inline void analyze_special(string_piece form, vector& lemmas) const;

  unsigned version;
  morpho_dictionary dictionary;
  english_morpho_guesser morpho_guesser;

  // Tags emitted by analyze() / analyze_special().
  string unknown_tag = "UNK";
  string number_tag = "CD", nnp_tag = "NNP", ls_tag = "LS";
  string open_quotation_tag = "``", close_quotation_tag = "''";
  string open_parenthesis_tag = "(", close_parenthesis_tag = ")";
  string comma_tag = ",", dot_tag = ".", punctuation_tag = ":", hash_tag = "#", dollar_tag = "$";
  string sym_tag = "SYM", jj_tag = "JJ", nn_tag = "NN", nns_tag = "NNS", cc_tag = "CC", pos_tag = "POS", in_tag = "IN";
};
5232
|
|
|
|
|
|
|
|
5233
|
|
|
|
|
|
|
} // namespace morphodita |
5234
|
|
|
|
|
|
|
|
5235
|
|
|
|
|
|
|
///////// |
5236
|
|
|
|
|
|
|
// File: morphodita/tokenizer/english_tokenizer.h |
5237
|
|
|
|
|
|
|
///////// |
5238
|
|
|
|
|
|
|
|
5239
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
5240
|
|
|
|
|
|
|
// |
5241
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
5242
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
5243
|
|
|
|
|
|
|
// |
5244
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
5245
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
5246
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5247
|
|
|
|
|
|
|
|
5248
|
|
|
|
|
|
|
namespace morphodita { |
5249
|
|
|
|
|
|
|
|
5250
|
0
|
|
|
|
|
|
// Tokenizer for English text, built on ragel_tokenizer.
// NOTE(review): `vector` and `unordered_set` have lost their template
// arguments in this listing - restore from the original source.
class english_tokenizer : public ragel_tokenizer {
 public:
  // Most recent tokenizer version.
  enum { LATEST = 2 };
  english_tokenizer(unsigned version);

  // Store the tokens of the next sentence in `tokens`.
  // NOTE(review): presumably returns false when the input is exhausted - confirm.
  virtual bool next_sentence(vector& tokens) override;

 private:
  // NOTE(review): presumably splits the last token in `tokens` (e.g.
  // clitics) - definition is outside this view, confirm.
  void split_token(vector& tokens);

  // Known abbreviations (definition is outside this view).
  static const unordered_set abbreviations;
};
5262
|
|
|
|
|
|
|
|
5263
|
|
|
|
|
|
|
} // namespace morphodita |
5264
|
|
|
|
|
|
|
|
5265
|
|
|
|
|
|
|
///////// |
5266
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho.cpp |
5267
|
|
|
|
|
|
|
///////// |
5268
|
|
|
|
|
|
|
|
5269
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
5270
|
|
|
|
|
|
|
// |
5271
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
5272
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
5273
|
|
|
|
|
|
|
// |
5274
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
5275
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
5276
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5277
|
|
|
|
|
|
|
|
5278
|
|
|
|
|
|
|
namespace morphodita { |
5279
|
|
|
|
|
|
|
|
5280
|
0
|
|
|
|
|
|
// Load the morphology from `is`.
//
// The stream is first decompressed into a binary_decoder; the dictionary and
// then the guesser are read from it. Returns false when decompression fails
// or a binary_decoder_error is thrown, and otherwise reports whether the
// decoder was consumed exactly to its end.
bool english_morpho::load(istream& is) {
  binary_decoder data;
  const bool decompressed = compressor::load(is, data);
  if (!decompressed) return false;

  bool parsed = true;
  try {
    // Order matters: both parts read sequentially from the same decoder.
    dictionary.load(data);
    morpho_guesser.load(data);
  } catch (binary_decoder_error&) {
    parsed = false;
  }

  return parsed && data.is_end();
}
5293
|
|
|
|
|
|
|
|
5294
|
0
|
|
|
|
|
|
// Analyse `form` and fill `lemmas` (cleared first) with its lemma-tag pairs.
//
// Sources are tried in this order and the first one that yields anything wins:
//  1. the dictionary, on the form and its casing variants; if the guesser is
//     allowed, proper-name analysis may add to the result (-> GUESSER when
//     it reports true, otherwise NO_GUESSER);
//  2. analyze_special for numbers, punctuation and symbols (-> NO_GUESSER);
//  3. the English guesser, only when guesser == GUESSER (-> GUESSER).
// If everything fails, or the form is empty, the form itself is returned
// with the unknown tag and the result is -1.
// NOTE(review): the element type of `vector` is missing in this listing
// (template arguments appear to have been stripped) - restore from the
// original source.
int english_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const {
  lemmas.clear();

  auto unknown = [&]() {
    lemmas.emplace_back(string(form.str, form.len), unknown_tag);
    return -1;
  };
  if (!form.len) return unknown();

  // Casing variants are left empty when they do not differ from the form.
  string form_uclc;  // first uppercase, rest lowercase
  string form_lc;    // all lowercase
  generate_casing_variants(form, form_uclc, form_lc);

  // 1. Dictionary, using all casing variants.
  dictionary.analyze(form, lemmas);
  if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas);
  if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas);
  if (!lemmas.empty()) {
    if (guesser != NO_GUESSER &&
        morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas))
      return GUESSER;
    return NO_GUESSER;
  }

  // 2. Numbers, punctuation and symbols.
  analyze_special(form, lemmas);
  if (!lemmas.empty()) return NO_GUESSER;

  // 3. English guesser on the lowercased form, if allowed.
  if (guesser == GUESSER)
    morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas);
  if (!lemmas.empty()) return GUESSER;

  return unknown();
}
5323
|
|
|
|
|
|
|
|
5324
|
0
|
|
|
|
|
|
// Generate the forms of `lemma` whose tags match `tag_wildcard`.
//
// Only the dictionary is consulted; the guesser mode is ignored. `forms` is
// always cleared first. Returns NO_GUESSER when the dictionary produced
// forms and -1 otherwise (including for an empty lemma).
// NOTE(review): the element type of `vector` is missing in this listing
// (template arguments appear to have been stripped) - restore from the
// original source.
int english_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector& forms) const {
  forms.clear();
  tag_filter filter(tag_wildcard);

  const bool generated = lemma.len && dictionary.generate(lemma, filter, forms);
  if (generated) return NO_GUESSER;
  return -1;
}
5336
|
|
|
|
|
|
|
|
5337
|
0
|
|
|
|
|
|
// Length of the raw lemma part of `lemma`; delegates to english_lemma_addinfo.
int english_morpho::raw_lemma_len(string_piece lemma) const {
  const int raw_length = english_lemma_addinfo::raw_lemma_len(lemma);
  return raw_length;
}
5340
|
|
|
|
|
|
|
|
5341
|
0
|
|
|
|
|
|
// Length of the lemma-id part of `lemma`; delegates to english_lemma_addinfo.
int english_morpho::lemma_id_len(string_piece lemma) const {
  const int id_length = english_lemma_addinfo::lemma_id_len(lemma);
  return id_length;
}
5344
|
|
|
|
|
|
|
|
5345
|
0
|
|
|
|
|
|
// The raw form is the whole form: simply report its length.
int english_morpho::raw_form_len(string_piece form) const {
  const auto whole_length = form.len;
  return whole_length;
}
5348
|
|
|
|
|
|
|
|
5349
|
0
|
|
|
|
|
|
// Create an english_tokenizer matching this model: models of version 2 or
// lower use tokenizer version 1, newer ones tokenizer version 2. The
// tokenizer is allocated with `new` and handed to the caller as a raw pointer.
tokenizer* english_morpho::new_tokenizer() const {
  const int tokenizer_version = version <= 2 ? 1 : 2;
  return new english_tokenizer(tokenizer_version);
}
5352
|
|
|
|
|
|
|
|
5353
|
0
|
|
|
|
|
|
// Tag numbers, punctuation and symbols that the dictionary does not cover.
// Matching analyses of the unchanged form are appended to `lemmas`; nothing
// is appended when the form is empty or none of the patterns applies.
//
// Steps, first match wins:
//  1. single-character forms with fixed tags (. ! ? , # $ [ ] % & * @ ');
//  2. numbers: [+-]? digit* (, digit{3})* (. digit*)? (s | [Ee] [+-]? digit+)?
//     with at least one digit and an optional trailing '.', where "digit" is
//     any code point of Unicode category N;
//  3. forms consisting solely of opening quotes, closing quotes, opening
//     brackets, closing brackets, symbols, or arbitrary punctuation.
//
// NOTE(review): the element type of `vector` is missing in this listing
// (template arguments appear to have been stripped) - restore from the
// original source.
void english_morpho::analyze_special(string_piece form, vector& lemmas) const {
  using namespace unilib;

  // Analyzer for numbers and punctuation.
  if (!form.len) return;

  // One-letter punctuation exceptions.
  if (form.len == 1)
    switch(*form.str) {
      case '.':
      case '!':
      case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return;
      case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return;
      case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return;
      case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return;
      case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
      case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
      case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag);
                lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
      case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag);
                lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
      case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag);
                lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
      case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag);
                lemmas.emplace_back(string(form.str, form.len), in_tag); return;
      case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag);
                 lemmas.emplace_back(string(form.str, form.len), pos_tag); return;
    }

  // Try matching a number: [+-]? is_Pn* (, is_Pn{3})* (. is_Pn*)? (s | [Ee] [+-]? is_Pn+)? with at least one digit
  string_piece number = form;
  char32_t codepoint = utf8::decode(number.str, number.len);
  bool any_digit = false;
  if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
  while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  while (codepoint == ',') {
    // A thousands separator must be followed by exactly three digits. Scan
    // them on a copy so that `number` is only advanced on success.
    // The category of each decoded character is tested against unicode::N;
    // previously the mask was applied to the code point itself, which does
    // not test for a digit, so valid groups were not accepted.
    string_piece group = number;
    bool three_digits = true;
    for (int i = 0; i < 3 && three_digits; i++)
      three_digits = (unicode::category(utf8::decode(group.str, group.len)) & unicode::N) != 0;
    if (!three_digits) break;
    any_digit = true;
    number = group;
    codepoint = utf8::decode(number.str, number.len);
  }
  // A '.' is a decimal separator only when something follows it; otherwise
  // it is left in `codepoint` as a possible trailing dot.
  if (codepoint == '.' && number.len) {
    codepoint = utf8::decode(number.str, number.len);
    while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  }
  // From model version 2 on, "<number>s" (e.g. "1990s") is both a number and
  // a plural noun whose lemma is the form without the final 's'.
  if (version >= 2 && any_digit && codepoint == 's' && !number.len) {
    lemmas.emplace_back(string(form.str, form.len), number_tag);
    lemmas.emplace_back(string(form.str, form.len - 1), nns_tag);
    return;
  }
  if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
    codepoint = utf8::decode(number.str, number.len);
    if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
    any_digit = false;  // the exponent needs digits of its own
    while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  }
  // The whole input must be consumed; the last decoded character is either
  // zero (presumably what the decoder yields at end of input) or one
  // trailing '.'.
  if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
    lemmas.emplace_back(string(form.str, form.len), number_tag);
    lemmas.emplace_back(string(form.str, form.len), nnp_tag);
    // A single digit 1-9, optionally followed by '.', may also be a list marker.
    if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9')
      lemmas.emplace_back(string(form.str, form.len), ls_tag);
    return;
  }

  // Open quotation, end quotation, open parentheses, end parentheses, symbol, or other.
  // Every flag starts true and is cleared by the first character that does
  // not belong to its class; scanning stops early once neither `symbol` nor
  // `any_punctuation` can still hold.
  string_piece punctuation = form;
  bool open_quotation = true, close_quotation = true, open_parenthesis = true, close_parenthesis = true, any_punctuation = true, symbol = true;
  while ((symbol || any_punctuation) && punctuation.len) {
    codepoint = utf8::decode(punctuation.str, punctuation.len);
    if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi;
    if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf;
    if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps;
    if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe;
    if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P;
    if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S;
  }
  // A class applies only if the whole form was scanned; more specific
  // classes are checked before the generic punctuation tag.
  if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; }
  if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; }
  if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; }
  if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; }
  if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; }
  if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; }
}
5439
|
|
|
|
|
|
|
|
5440
|
|
|
|
|
|
|
} // namespace morphodita |
5441
|
|
|
|
|
|
|
|
5442
|
|
|
|
|
|
|
///////// |
5443
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho_guesser.cpp |
5444
|
|
|
|
|
|
|
///////// |
5445
|
|
|
|
|
|
|
|
5446
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
5447
|
|
|
|
|
|
|
// |
5448
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
5449
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
5450
|
|
|
|
|
|
|
// |
5451
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
5452
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
5453
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
5454
|
|
|
|
|
|
|
|
5455
|
|
|
|
|
|
|
// This code is a reimplementation of morphologic analyzer Morphium |
5456
|
|
|
|
|
|
|
// by Johanka Spoustova (Treex::Tool::EnglishMorpho::Analysis Perl module) |
5457
|
|
|
|
|
|
|
// and reimplementation of morphologic lemmatizer by Martin Popel |
5458
|
|
|
|
|
|
|
// (Treex::Tool::EnglishMorpho::Lemmatizer Perl module). The latter is based |
5459
|
|
|
|
|
|
|
// on morpha: |
5460
|
|
|
|
|
|
|
// Minnen, G., J. Carroll and D. Pearce (2001). Applied morphological |
5461
|
|
|
|
|
|
|
// processing of English, Natural Language Engineering, 7(3). 207-223. |
5462
|
|
|
|
|
|
|
// Morpha has been released under LGPL as a part of RASP system |
5463
|
|
|
|
|
|
|
// http://ilexir.co.uk/applications/rasp/. |
5464
|
|
|
|
|
|
|
|
5465
|
|
|
|
|
|
|
namespace morphodita { |
5466
|
|
|
|
|
|
|
|
5467
|
0
|
|
|
|
|
|
// Reads the guesser model from `data`.
//
// The stream is consumed in a fixed order: a 2-byte tag count, that many tags
// (each a 1-byte length followed by the tag bytes), then the serialized
// `exceptions` map and finally the serialized `negations` map.
void english_morpho_guesser::load(binary_decoder& data) {
  const unsigned tag_count = data.next_2B();

  // Drop anything left from a previous load before refilling the tag table.
  exceptions_tags.clear();
  exceptions_tags.reserve(tag_count);
  for (unsigned remaining = tag_count; remaining > 0; --remaining) {
    unsigned tag_len = data.next_1B();
    exceptions_tags.emplace_back(string(data.next(tag_len), tag_len));
  }

  exceptions.load(data);
  negations.load(data);
}
5479
|
|
|
|
|
|
|
|
5480
|
|
|
|
|
|
|
// Transition tables of the `tag_guesser` state machine that
// english_morpho_guesser::analyze walks over the lowercased form.
// NOTE(review): the layout looks like table-driven output of a state-machine
// compiler, so the values should presumably be regenerated rather than edited
// by hand -- confirm with the build sources.
//
// How analyze uses the tables, for the current state `cs`:
//   key_offsets[cs]    start of this state's keys inside trans_keys
//   single_lengths[cs] number of exact-match keys (binary searched)
//   range_lengths[cs]  number of [low, high] key pairs that follow them
//   index_offsets[cs]  first slot of this state inside indicies; the slot after
//                      all singles and ranges is used when no key matches
//   indicies[slot]     transition id
//   trans_targs[id]    next state
//   trans_actions[id]  offset into actions, 0 meaning "no action"
//   eof_actions[cs]    offset into actions consulted once the input is exhausted

// Action lists: at each referenced offset, a count followed by that many action ids.
static const char _tag_guesser_actions[] = {
	0, 1, 0, 1, 1, 1, 2, 1,
	3, 1, 4, 1, 5, 1, 6, 1,
	7, 2, 2, 6, 2, 2, 7, 2,
	4, 6, 2, 4, 7, 2, 5, 6,
	2, 5, 7, 2, 6, 7, 3, 2,
	6, 7, 3, 4, 6, 7, 3, 5,
	6, 7
};

// Per-state offset of the first key in _tag_guesser_trans_keys.
static const unsigned char _tag_guesser_key_offsets[] = {
	0, 19, 26, 34, 42, 50, 58, 66,
	74, 82, 90, 100, 108, 116, 124, 132,
	145, 153, 161, 168, 179, 195, 212, 220,
	228, 236
};

// Per-state keys: the exact-match keys first, then the [low, high] range pairs.
static const char _tag_guesser_trans_keys[] = {
	45, 46, 99, 100, 103, 105, 109, 110,
	114, 115, 116, 118, 120, 48, 57, 65,
	90, 97, 122, 45, 48, 57, 65, 90,
	97, 122, 45, 114, 48, 57, 65, 90,
	97, 122, 45, 111, 48, 57, 65, 90,
	97, 122, 45, 109, 48, 57, 65, 90,
	97, 122, 45, 101, 48, 57, 65, 90,
	97, 122, 45, 115, 48, 57, 65, 90,
	97, 122, 45, 101, 48, 57, 65, 90,
	97, 122, 45, 108, 48, 57, 65, 90,
	97, 122, 45, 115, 48, 57, 65, 90,
	97, 122, 45, 97, 101, 111, 48, 57,
	65, 90, 98, 122, 45, 101, 48, 57,
	65, 90, 97, 122, 45, 108, 48, 57,
	65, 90, 97, 122, 45, 109, 48, 57,
	65, 90, 97, 122, 45, 105, 48, 57,
	65, 90, 97, 122, 45, 97, 101, 105,
	111, 117, 121, 48, 57, 65, 90, 98,
	122, 45, 115, 48, 57, 65, 90, 97,
	122, 45, 101, 48, 57, 65, 90, 97,
	122, 45, 48, 57, 65, 90, 97, 122,
	45, 101, 114, 115, 116, 48, 57, 65,
	90, 97, 122, 45, 46, 105, 109, 118,
	120, 48, 57, 65, 90, 97, 98, 99,
	100, 101, 122, 45, 46, 101, 105, 109,
	118, 120, 48, 57, 65, 90, 97, 98,
	99, 100, 102, 122, 45, 110, 48, 57,
	65, 90, 97, 122, 45, 105, 48, 57,
	65, 90, 97, 122, 45, 101, 48, 57,
	65, 90, 97, 122, 45, 115, 48, 57,
	65, 90, 97, 122, 0
};

// Per-state number of exact-match keys.
static const char _tag_guesser_single_lengths[] = {
	13, 1, 2, 2, 2, 2, 2, 2,
	2, 2, 4, 2, 2, 2, 2, 7,
	2, 2, 1, 5, 6, 7, 2, 2,
	2, 2
};

// Per-state number of [low, high] key pairs.
static const char _tag_guesser_range_lengths[] = {
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 5, 5, 3, 3,
	3, 3
};

// Per-state offset of the first slot in _tag_guesser_indicies.
static const unsigned char _tag_guesser_index_offsets[] = {
	0, 17, 22, 28, 34, 40, 46, 52,
	58, 64, 70, 78, 84, 90, 96, 102,
	113, 119, 125, 130, 139, 151, 164, 170,
	176, 182
};

// Maps a (state, matched key) slot to a transition id.
static const char _tag_guesser_indicies[] = {
	1, 2, 5, 6, 7, 5, 5, 8,
	9, 10, 11, 5, 5, 3, 4, 4,
	0, 13, 14, 15, 15, 12, 13, 16,
	14, 15, 15, 12, 13, 17, 14, 15,
	15, 12, 13, 18, 14, 15, 15, 12,
	13, 18, 14, 15, 15, 12, 13, 19,
	14, 15, 15, 12, 13, 20, 14, 15,
	15, 12, 13, 18, 14, 15, 15, 12,
	13, 21, 14, 15, 15, 12, 13, 22,
	23, 24, 14, 15, 15, 12, 13, 25,
	14, 15, 15, 12, 13, 23, 14, 15,
	15, 12, 13, 23, 14, 15, 15, 12,
	13, 26, 14, 15, 15, 12, 28, 15,
	15, 15, 15, 15, 15, 29, 26, 26,
	27, 31, 4, 32, 33, 33, 30, 13,
	23, 14, 15, 15, 12, 13, 14, 15,
	15, 12, 13, 34, 35, 36, 37, 14,
	15, 15, 12, 13, 38, 39, 39, 39,
	39, 14, 15, 15, 39, 15, 12, 13,
	38, 40, 39, 39, 39, 39, 14, 15,
	15, 39, 15, 12, 13, 41, 14, 15,
	15, 12, 13, 42, 14, 15, 15, 12,
	13, 18, 14, 15, 15, 12, 13, 43,
	14, 15, 15, 12, 0
};

// Target state of every transition id.
static const char _tag_guesser_trans_targs[] = {
	18, 19, 20, 18, 18, 20, 21, 22,
	23, 24, 16, 25, 18, 19, 18, 1,
	3, 4, 18, 7, 8, 10, 11, 18,
	13, 12, 18, 18, 19, 18, 18, 19,
	18, 18, 2, 5, 6, 9, 20, 20,
	18, 14, 15, 17
};

// Offset into _tag_guesser_actions for every transition id (0 = no action).
static const char _tag_guesser_trans_actions[] = {
	29, 46, 29, 32, 11, 11, 11, 11,
	11, 11, 0, 11, 13, 35, 15, 0,
	0, 0, 1, 0, 0, 0, 0, 3,
	0, 0, 5, 17, 38, 20, 23, 42,
	26, 9, 0, 0, 0, 0, 13, 0,
	7, 0, 0, 0
};

// Per-state offset into _tag_guesser_actions used when the input ends in that state.
static const char _tag_guesser_eof_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 15, 15, 0, 0,
	0, 0
};

// Initial state of the machine.
static const int tag_guesser_start = 0;
5605
|
|
|
|
|
|
|
|
5606
|
0
|
|
|
|
|
|
// Appends lemma/tag candidates for one word form to `lemmas`.
//
// `form_lc` is looked up in the exceptions list first; on a hit the stored
// lemma/tag pairs are emitted as they are. Otherwise a negative prefix is
// searched for, the default tags are added, and the `tag_guesser` state
// machine -- which consumes the form from its last character towards the
// first -- adds the specialized tags. Proper-name candidates are appended at
// the very end in both cases.
//
// NOTE(review): `vector& lemmas` and the `data.next(...)` calls carry no template
// arguments in this rendering; they may have been lost in extraction -- confirm
// against the upstream source.
void english_morpho_guesser::analyze(string_piece form, string_piece form_lc, vector& lemmas) const {
  // Try exceptions list. The callback only steps over one stored entry:
  // a 1-byte item count, each item made of two fields preceded by a 1-byte count.
  auto* exception = exceptions.at(form_lc.str, form_lc.len, [](pointer_decoder& data){
    for (unsigned items = data.next_1B(); items > 0; --items) {
      data.next(data.next_1B());
      data.next(data.next_1B());
    }
  });

  if (exception) {
    // Found in exceptions list: decode every lemma together with its tag indices.
    pointer_decoder data(exception);
    for (unsigned items = data.next_1B(); items > 0; --items) {
      unsigned lemma_len = data.next_1B();
      string lemma(data.next(lemma_len), lemma_len);
      for (unsigned tags_left = data.next_1B(); tags_left > 0; --tags_left)
        lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]);
    }
  } else {
    // Try stripping negative prefix and use rule guesser
    string lemma_lc(form_lc.str, form_lc.len);

    // Try finding negative prefix: extend the prefix one character at a time
    // while it is still present in `negations`; the last entry that carries a
    // negation length and leaves enough characters to follow wins.
    unsigned negation_len = 0;
    for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) {
      auto found = negations.at(form_lc.str, prefix, [](pointer_decoder& data){ data.next(TOTAL); });
      if (!found) break;
      if (found[NEGATION_LEN] && form_lc.len - prefix >= found[TO_FOLLOW])
        negation_len = found[NEGATION_LEN];
    }

    // Add default tags
    add(FW, lemma_lc, lemmas);
    add(JJ, lemma_lc, negation_len, lemmas);
    add(RB, lemma_lc, negation_len, lemmas);
    add(NN, lemma_lc, negation_len, lemmas);
    add_NNS(lemma_lc, negation_len, lemmas);

    // Add specialized tags
    bool added_JJR_RBR = false, added_JJS_RBS = false, added_SYM = false, added_CD = false;

    // Returns the slot in _tag_guesser_indicies selected by `ch` in `state`:
    // exact-match keys are tried first, then the [low, high] ranges, and the
    // slot just past both groups is the fallback when nothing matches.
    const auto find_slot = [](int state, char ch) -> unsigned int {
      const char* keys = _tag_guesser_trans_keys + _tag_guesser_key_offsets[state];
      unsigned int slot = _tag_guesser_index_offsets[state];

      const int singles = _tag_guesser_single_lengths[state];
      if (singles > 0) {
        const char* lower = keys;
        const char* upper = keys + singles - 1;
        while (!(upper < lower)) {
          const char* mid = lower + ((upper - lower) >> 1);
          if (ch < *mid)
            upper = mid - 1;
          else if (ch > *mid)
            lower = mid + 1;
          else
            return slot + (unsigned int)(mid - keys);
        }
        keys += singles;
        slot += singles;
      }

      const int ranges = _tag_guesser_range_lengths[state];
      if (ranges > 0) {
        const char* lower = keys;
        const char* upper = keys + (ranges << 1) - 2;
        while (!(upper < lower)) {
          // Keep `mid` on the first element of a pair.
          const char* mid = lower + (((upper - lower) >> 1) & ~1);
          if (ch < mid[0])
            upper = mid - 2;
          else if (ch > mid[1])
            lower = mid + 2;
          else
            return slot + (unsigned int)((mid - keys) >> 1);
        }
        slot += ranges;
      }

      return slot;
    };

    int cs = tag_guesser_start;

    // Feed the characters of form_lc to the machine, last character first.
    for (const char* cur = form_lc.str + form_lc.len; cur != form_lc.str; ) {
      const char ch = *--cur;

      const unsigned int trans = _tag_guesser_indicies[find_slot(cs, ch)];
      cs = _tag_guesser_trans_targs[trans];
      if (_tag_guesser_trans_actions[trans] == 0) continue;

      // An action list is a count followed by that many action ids.
      const char* acts = _tag_guesser_actions + _tag_guesser_trans_actions[trans];
      for (unsigned int left = (unsigned int) *acts++; left > 0; --left) {
        switch (*acts++) {
          case 0:
            if (!added_JJR_RBR) { added_JJR_RBR = true; add_JJR_RBR(lemma_lc, negation_len, lemmas); }
            break;
          case 1:
            if (!added_JJS_RBS) { added_JJS_RBS = true; add_JJS_RBS(lemma_lc, negation_len, lemmas); }
            break;
          case 2:
            add_VBG(lemma_lc, lemmas);
            break;
          case 3:
            add_VBD_VBN(lemma_lc, lemmas);
            break;
          case 4:
            add_VBZ(lemma_lc, lemmas);
            break;
          case 5:
            add(VB, lemma_lc, lemmas);
            add(VBP, lemma_lc, lemmas);
            break;
          case 6:
            if (!added_SYM) { added_SYM = true; add(SYM, lemma_lc, lemmas); }
            break;
          case 7:
            if (!added_CD) { added_CD = true; add(CD, lemma_lc, lemmas); }
            break;
        }
      }
    }

    // End of input: run the end-of-input actions of the final state
    // (only action 7 is handled here).
    const char* eof_acts = _tag_guesser_actions + _tag_guesser_eof_actions[cs];
    for (unsigned int left = (unsigned int) *eof_acts++; left > 0; --left) {
      if (*eof_acts++ == 7) {
        if (!added_CD) { added_CD = true; add(CD, lemma_lc, lemmas); }
      }
    }
  }

  // Add proper names
  analyze_proper_names(form, form_lc, lemmas);
}
5774
|
|
|
|
|
|
|
|
5775
|
0
|
|
|
|
|
|
// Appends proper-name candidates (NNP and/or NNPS) for `form` to `lemmas`
// unless a lemma with that tag is already present. Returns true when at
// least one of the two add calls was made.
bool english_morpho_guesser::analyze_proper_names(string_piece form, string_piece form_lc, vector& lemmas) const {
  // NNP if form_lc != form or form.str[0] =~ /[0-9']/, NNPS if form_lc != form.
  // "form_lc != form" is decided by comparing the two data pointers, not the characters.
  // NOTE(review): presumably the caller passes the same buffer for both when
  // the form is already lowercase -- confirm at the call sites.
  const bool differs_from_lc = form.str != form_lc.str;
  const bool is_NNP = differs_from_lc || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9')));
  const bool is_NNPS = differs_from_lc;
  if (!is_NNP && !is_NNPS) return false;

  // Find out which of the two tags have already been produced.
  bool was_NNP = false, was_NNPS = false;
  for (auto&& existing : lemmas) {
    if (existing.tag == NNP) was_NNP = true;
    if (existing.tag == NNPS) was_NNPS = true;
  }

  const bool want_NNP = is_NNP && !was_NNP;
  const bool want_NNPS = is_NNPS && !was_NNPS;
  if (!want_NNP && !want_NNPS) return false;

  // Proper names keep the original casing of the form.
  string lemma(form.str, form.len);
  if (want_NNP) add(NNP, lemma, lemmas);
  if (want_NNPS) add_NNPS(lemma, lemmas);
  return true;
}
5793
|
|
|
|
|
|
|
|
5794
|
|
|
|
|
|
|
inline void english_morpho_guesser::add(const string& tag, const string& form, vector& lemmas) const { |
5795
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(form, tag); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
5796
|
|
|
|
|
|
|
} |
5797
|
|
|
|
|
|
|
|
5798
|
|
|
|
|
|
|
inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, vector& lemmas) const { |
5799
|
|
|
|
|
|
|
add(tag, form, lemmas); |
5800
|
|
|
|
|
|
|
add(tag2, form, lemmas); |
5801
|
|
|
|
|
|
|
} |
5802
|
|
|
|
|
|
|
|
5803
|
0
|
|
|
|
|
|
inline void english_morpho_guesser::add(const string& tag, const string& form, unsigned negation_len, vector& lemmas) const { |
5804
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
5805
|
0
|
|
|
|
|
|
} |
5806
|
|
|
|
|
|
|
|
5807
|
0
|
|
|
|
|
|
inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector& lemmas) const { |
5808
|
0
|
|
|
|
|
|
add(tag, form, negation_len, lemmas); |
5809
|
0
|
|
|
|
|
|
add(tag2, form, negation_len, lemmas); |
5810
|
0
|
|
|
|
|
|
} |
5811
|
|
|
|
|
|
|
|
5812
|
|
|
|
|
|
|
// Common definitions (written backwards)
// Every macro argument is parenthesized so that compound expressions
// (e.g. a length written as `a + b`) expand with the intended precedence.
// Note that `str` is evaluated twice.

// REM(str, len): copy of `str` without its last `len` characters.
#define REM(str, len) ((str).substr(0, (str).size() - (len)))
// REM_ADD(str, len, add): copy of `str` without its last `len` characters, with `add` appended.
#define REM_ADD(str, len, add) ((str).substr(0, (str).size() - (len)).append(add))
5815
|
|
|
|
|
|
|
|
5816
|
|
|
|
|
|
|
// Transition tables of the `NNS` state machine walked by
// english_morpho_guesser::add_NNS; the machine reads the form from its last
// character backwards. For the current state `cs`:
//   key_offsets[cs]    start of this state's keys inside trans_keys
//   single_lengths[cs] number of exact-match keys (binary searched)
//   range_lengths[cs]  number of [low, high] key pairs that follow them
//   index_offsets[cs]  first slot of this state inside indicies; the slot after
//                      all singles and ranges is used when no key matches
//   indicies[slot]     transition id
//   trans_targs[id]    next state; add_NNS stops as soon as the state becomes 0
//   trans_actions[id]  offset into actions, 0 meaning "no action"
// NOTE(review): the layout looks like generated state-machine output, so the
// values should presumably be regenerated rather than edited by hand -- confirm.

// Action lists: at each referenced offset, a count followed by that many action ids.
static const char _NNS_actions[] = {
	0, 1, 0, 1, 1, 1, 2, 1,
	3, 1, 4, 1, 5, 1, 6, 1,
	7, 1, 8, 1, 9, 1, 10, 1,
	11, 1, 12, 1, 13
};

// Per-state offset of the first key in _NNS_trans_keys.
static const char _NNS_key_offsets[] = {
	0, 0, 2, 3, 4, 5, 7, 17,
	17, 29, 30, 35, 35, 36, 37, 37,
	37, 44, 45, 53, 63, 72
};

// Per-state keys: the exact-match keys first, then the [low, high] range pairs.
static const char _NNS_trans_keys[] = {
	110, 115, 101, 109, 101, 99, 115, 98,
	100, 102, 104, 106, 110, 112, 116, 118,
	122, 104, 122, 98, 100, 102, 103, 106,
	110, 112, 116, 118, 120, 111, 97, 101,
	105, 111, 117, 105, 119, 104, 105, 111,
	115, 118, 120, 122, 115, 97, 101, 105,
	110, 111, 114, 115, 117, 98, 100, 102,
	104, 106, 110, 112, 116, 118, 122, 97,
	101, 105, 111, 117, 121, 122, 98, 120,
	0
};

// Per-state number of exact-match keys.
static const char _NNS_single_lengths[] = {
	0, 2, 1, 1, 1, 2, 0, 0,
	2, 1, 5, 0, 1, 1, 0, 0,
	7, 1, 8, 0, 7, 0
};

// Per-state number of [low, high] key pairs.
static const char _NNS_range_lengths[] = {
	0, 0, 0, 0, 0, 0, 5, 0,
	5, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 5, 1, 0
};

// Per-state offset of the first slot in _NNS_indicies.
static const char _NNS_index_offsets[] = {
	0, 0, 3, 5, 7, 9, 12, 18,
	19, 27, 29, 35, 36, 38, 40, 41,
	42, 50, 52, 61, 67, 76
};

// Maps a (state, matched key) slot to a transition id.
static const char _NNS_indicies[] = {
	0, 2, 1, 3, 1, 4, 1, 6,
	5, 7, 7, 1, 8, 8, 8, 8,
	8, 1, 9, 11, 10, 10, 10, 10,
	10, 10, 1, 12, 1, 13, 13, 13,
	13, 13, 1, 14, 15, 1, 16, 1,
	17, 1, 18, 19, 20, 21, 22, 7,
	23, 1, 24, 1, 25, 25, 25, 26,
	25, 27, 28, 29, 1, 30, 30, 30,
	30, 30, 1, 31, 31, 31, 31, 31,
	31, 33, 32, 1, 17, 0
};

// Target state of every transition id.
static const char _NNS_trans_targs[] = {
	2, 0, 4, 3, 15, 15, 16, 15,
	7, 15, 15, 17, 15, 11, 15, 13,
	15, 15, 5, 6, 8, 18, 12, 20,
	15, 15, 9, 10, 15, 19, 15, 15,
	14, 21
};

// Offset into _NNS_actions for every transition id (0 = no action).
static const char _NNS_trans_actions[] = {
	0, 0, 0, 0, 1, 27, 27, 21,
	0, 23, 25, 25, 19, 0, 17, 0,
	5, 11, 0, 0, 0, 21, 0, 21,
	3, 9, 0, 0, 15, 9, 7, 13,
	0, 15
};

// Initial state of the machine.
static const int NNS_start = 1;
5890
|
|
|
|
|
|
|
|
5891
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_NNS(const string& form, unsigned negation_len, vector& lemmas) const { |
5892
|
0
|
|
|
|
|
|
const char* p = form.c_str() + negation_len; int cs; |
5893
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
5894
|
|
|
|
|
|
|
|
5895
|
|
|
|
|
|
|
{ |
5896
|
|
|
|
|
|
|
cs = NNS_start; |
5897
|
|
|
|
|
|
|
} |
5898
|
|
|
|
|
|
|
|
5899
|
|
|
|
|
|
|
{ |
5900
|
|
|
|
|
|
|
int _klen; |
5901
|
|
|
|
|
|
|
unsigned int _trans; |
5902
|
|
|
|
|
|
|
const char *_acts; |
5903
|
|
|
|
|
|
|
unsigned int _nacts; |
5904
|
|
|
|
|
|
|
const char *_keys; |
5905
|
|
|
|
|
|
|
|
5906
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
5907
|
|
|
|
|
|
|
goto _test_eof; |
5908
|
|
|
|
|
|
|
if ( cs == 0 ) |
5909
|
|
|
|
|
|
|
goto _out; |
5910
|
|
|
|
|
|
|
_resume: |
5911
|
0
|
|
|
|
|
|
_keys = _NNS_trans_keys + _NNS_key_offsets[cs]; |
5912
|
0
|
|
|
|
|
|
_trans = _NNS_index_offsets[cs]; |
5913
|
|
|
|
|
|
|
|
5914
|
0
|
|
|
|
|
|
_klen = _NNS_single_lengths[cs]; |
5915
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
5916
|
|
|
|
|
|
|
const char *_lower = _keys; |
5917
|
|
|
|
|
|
|
const char *_mid; |
5918
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
5919
|
|
|
|
|
|
|
while (1) { |
5920
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
5921
|
|
|
|
|
|
|
break; |
5922
|
|
|
|
|
|
|
|
5923
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
5924
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
5925
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
5926
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
5927
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
5928
|
|
|
|
|
|
|
else { |
5929
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
5930
|
0
|
|
|
|
|
|
goto _match; |
5931
|
|
|
|
|
|
|
} |
5932
|
|
|
|
|
|
|
} |
5933
|
0
|
|
|
|
|
|
_keys += _klen; |
5934
|
0
|
|
|
|
|
|
_trans += _klen; |
5935
|
|
|
|
|
|
|
} |
5936
|
|
|
|
|
|
|
|
5937
|
0
|
|
|
|
|
|
_klen = _NNS_range_lengths[cs]; |
5938
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
5939
|
|
|
|
|
|
|
const char *_lower = _keys; |
5940
|
|
|
|
|
|
|
const char *_mid; |
5941
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
5942
|
|
|
|
|
|
|
while (1) { |
5943
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
5944
|
|
|
|
|
|
|
break; |
5945
|
|
|
|
|
|
|
|
5946
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
5947
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
5948
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
5949
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
5950
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
5951
|
|
|
|
|
|
|
else { |
5952
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
5953
|
0
|
|
|
|
|
|
goto _match; |
5954
|
|
|
|
|
|
|
} |
5955
|
|
|
|
|
|
|
} |
5956
|
0
|
|
|
|
|
|
_trans += _klen; |
5957
|
|
|
|
|
|
|
} |
5958
|
|
|
|
|
|
|
|
5959
|
|
|
|
|
|
|
_match: |
5960
|
0
|
|
|
|
|
|
_trans = _NNS_indicies[_trans]; |
5961
|
0
|
|
|
|
|
|
cs = _NNS_trans_targs[_trans]; |
5962
|
|
|
|
|
|
|
|
5963
|
0
|
0
|
|
|
|
|
if ( _NNS_trans_actions[_trans] == 0 ) |
5964
|
|
|
|
|
|
|
goto _again; |
5965
|
|
|
|
|
|
|
|
5966
|
0
|
|
|
|
|
|
_acts = _NNS_actions + _NNS_trans_actions[_trans]; |
5967
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
5968
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
5969
|
|
|
|
|
|
|
{ |
5970
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
5971
|
|
|
|
|
|
|
{ |
5972
|
|
|
|
|
|
|
case 0: |
5973
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 2, append = "an"; } |
5974
|
|
|
|
|
|
|
break; |
5975
|
|
|
|
|
|
|
case 1: |
5976
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 1, append = nullptr; } |
5977
|
|
|
|
|
|
|
break; |
5978
|
|
|
|
|
|
|
case 2: |
5979
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = "fe"; } |
5980
|
|
|
|
|
|
|
break; |
5981
|
|
|
|
|
|
|
case 3: |
5982
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
5983
|
|
|
|
|
|
|
break; |
5984
|
|
|
|
|
|
|
case 4: |
5985
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
5986
|
|
|
|
|
|
|
break; |
5987
|
|
|
|
|
|
|
case 5: |
5988
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
5989
|
|
|
|
|
|
|
break; |
5990
|
|
|
|
|
|
|
case 6: |
5991
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
5992
|
|
|
|
|
|
|
break; |
5993
|
|
|
|
|
|
|
case 7: |
5994
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
5995
|
|
|
|
|
|
|
break; |
5996
|
|
|
|
|
|
|
case 8: |
5997
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
5998
|
|
|
|
|
|
|
break; |
5999
|
|
|
|
|
|
|
case 9: |
6000
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
6001
|
|
|
|
|
|
|
break; |
6002
|
|
|
|
|
|
|
case 10: |
6003
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
6004
|
|
|
|
|
|
|
break; |
6005
|
|
|
|
|
|
|
case 11: |
6006
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 3, append = "y"; } |
6007
|
|
|
|
|
|
|
break; |
6008
|
|
|
|
|
|
|
case 12: |
6009
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
6010
|
|
|
|
|
|
|
break; |
6011
|
|
|
|
|
|
|
case 13: |
6012
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
6013
|
|
|
|
|
|
|
break; |
6014
|
|
|
|
|
|
|
} |
6015
|
|
|
|
|
|
|
} |
6016
|
|
|
|
|
|
|
|
6017
|
|
|
|
|
|
|
_again: |
6018
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
6019
|
|
|
|
|
|
|
goto _out; |
6020
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
6021
|
|
|
|
|
|
|
goto _resume; |
6022
|
|
|
|
|
|
|
_test_eof: {} |
6023
|
|
|
|
|
|
|
_out: {} |
6024
|
|
|
|
|
|
|
} |
6025
|
|
|
|
|
|
|
|
6026
|
0
|
0
|
|
|
|
|
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
6027
|
0
|
|
|
|
|
|
} |
6028
|
|
|
|
|
|
|
|
6029
|
|
|
|
|
|
|
// Transition tables of the `NNPS` state machine used by
// english_morpho_guesser::add_NNPS.
// NOTE(review): the tables are named and shaped like the `NNS` tables
// (per-state key offsets, single/range key counts, index offsets, transition
// targets and action offsets) and are presumably interpreted the same way --
// confirm against add_NNPS. The key set contains both upper- and lowercase
// letter codes. The layout looks like generated state-machine output, so the
// values should presumably be regenerated rather than edited by hand.

// Action lists: presumably a count followed by that many action ids at each referenced offset.
static const char _NNPS_actions[] = {
	0, 1, 1, 1, 2, 1, 4, 1,
	5, 1, 6, 1, 7, 1, 8, 1,
	9, 1, 10, 1, 11, 1, 12, 1,
	14, 1, 15, 1, 16, 2, 0, 1,
	2, 3, 4, 2, 13, 14
};

// Per-state offset of the first key in _NNPS_trans_keys.
static const unsigned char _NNPS_key_offsets[] = {
	0, 0, 4, 6, 8, 10, 12, 16,
	36, 36, 60, 62, 72, 72, 74, 76,
	78, 78, 98, 98, 100, 102, 104, 104,
	118, 120, 136, 156, 174, 174
};

// Per-state keys.
static const char _NNPS_trans_keys[] = {
	78, 83, 110, 115, 69, 101, 77, 109,
	77, 109, 69, 101, 67, 83, 99, 115,
	66, 68, 70, 72, 74, 78, 80, 84,
	86, 90, 98, 100, 102, 104, 106, 110,
	112, 116, 118, 122, 72, 90, 104, 122,
	66, 68, 70, 71, 74, 78, 80, 84,
	86, 88, 98, 100, 102, 103, 106, 110,
	112, 116, 118, 120, 79, 111, 65, 69,
	73, 79, 85, 97, 101, 105, 111, 117,
	73, 105, 87, 119, 87, 119, 66, 68,
	70, 72, 74, 78, 80, 84, 86, 90,
	98, 100, 102, 104, 106, 110, 112, 116,
	118, 122, 73, 105, 69, 101, 69, 101,
	72, 73, 79, 83, 86, 88, 90, 104,
	105, 111, 115, 118, 120, 122, 83, 115,
	65, 69, 73, 78, 79, 82, 83, 85,
	97, 101, 105, 110, 111, 114, 115, 117,
	66, 68, 70, 72, 74, 78, 80, 84,
	86, 90, 98, 100, 102, 104, 106, 110,
	112, 116, 118, 122, 65, 69, 73, 79,
	85, 89, 90, 97, 101, 105, 111, 117,
	121, 122, 66, 88, 98, 120, 72, 73,
	79, 83, 86, 88, 90, 104, 105, 111,
	115, 118, 120, 122, 0
};

// Per-state number of exact-match keys.
static const char _NNPS_single_lengths[] = {
	0, 4, 2, 2, 2, 2, 4, 0,
	0, 4, 2, 10, 0, 2, 2, 2,
	0, 0, 0, 2, 2, 2, 0, 14,
	2, 16, 0, 14, 0, 14
};

// Per-state number of key ranges (presumably [low, high] pairs).
static const char _NNPS_range_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 10,
	0, 10, 0, 0, 0, 0, 0, 0,
	0, 10, 0, 0, 0, 0, 0, 0,
	0, 0, 10, 2, 0, 0
};

// Per-state offset of the first slot in _NNPS_indicies.
static const unsigned char _NNPS_index_offsets[] = {
	0, 0, 5, 8, 11, 14, 17, 22,
	33, 34, 49, 52, 63, 64, 67, 70,
	73, 74, 85, 86, 89, 92, 95, 96,
	111, 114, 131, 142, 159, 160
};

// Presumably maps a (state, matched key) slot to a transition id.
static const char _NNPS_indicies[] = {
	0, 2, 3, 4, 1, 5, 6, 1,
	7, 8, 1, 8, 8, 1, 10, 11,
	9, 12, 12, 12, 12, 1, 13, 13,
	13, 13, 13, 13, 13, 13, 13, 13,
	1, 14, 16, 15, 16, 15, 15, 15,
	15, 15, 15, 15, 15, 15, 15, 15,
	1, 17, 17, 1, 18, 18, 18, 18,
	18, 18, 18, 18, 18, 18, 1, 19,
	20, 21, 1, 22, 23, 1, 23, 23,
	1, 24, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 1, 26, 21, 21,
	1, 6, 6, 1, 11, 11, 9, 1,
	27, 28, 29, 30, 31, 12, 32, 27,
	33, 29, 30, 34, 12, 32, 1, 35,
	35, 1, 36, 36, 36, 37, 36, 38,
	39, 40, 36, 36, 36, 37, 36, 38,
	39, 40, 1, 41, 41, 41, 41, 41,
	41, 41, 41, 41, 41, 1, 42, 42,
	42, 42, 42, 42, 44, 42, 42, 42,
	42, 42, 42, 44, 43, 43, 1, 24,
	27, 33, 29, 30, 34, 12, 32, 27,
	33, 29, 30, 34, 12, 32, 1, 0
};

// Presumably the target state of every transition id.
static const char _NNPS_trans_targs[] = {
	2, 0, 5, 20, 21, 3, 4, 22,
	22, 22, 23, 29, 22, 8, 22, 22,
	24, 22, 12, 22, 14, 15, 22, 22,
	22, 18, 22, 6, 7, 9, 25, 13,
	27, 17, 19, 22, 22, 10, 11, 22,
	26, 22, 22, 16, 28
};

// Presumably the offset into _NNPS_actions for every transition id (0 = no action).
static const char _NNPS_trans_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 29,
	1, 27, 27, 27, 21, 0, 35, 25,
	25, 19, 0, 17, 0, 0, 32, 5,
	11, 0, 23, 0, 0, 0, 21, 0,
	21, 0, 0, 3, 9, 0, 0, 15,
	9, 7, 13, 0, 15
};

// Initial state of the machine.
static const int NNPS_start = 1;
6136
|
|
|
|
|
|
|
|
6137
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_NNPS(const string& form, vector& lemmas) const { |
6138
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
6139
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
6140
|
|
|
|
|
|
|
|
6141
|
|
|
|
|
|
|
{ |
6142
|
|
|
|
|
|
|
cs = NNPS_start; |
6143
|
|
|
|
|
|
|
} |
6144
|
|
|
|
|
|
|
|
6145
|
|
|
|
|
|
|
{ |
6146
|
|
|
|
|
|
|
int _klen; |
6147
|
|
|
|
|
|
|
unsigned int _trans; |
6148
|
|
|
|
|
|
|
const char *_acts; |
6149
|
|
|
|
|
|
|
unsigned int _nacts; |
6150
|
|
|
|
|
|
|
const char *_keys; |
6151
|
|
|
|
|
|
|
|
6152
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
6153
|
|
|
|
|
|
|
goto _test_eof; |
6154
|
|
|
|
|
|
|
if ( cs == 0 ) |
6155
|
|
|
|
|
|
|
goto _out; |
6156
|
|
|
|
|
|
|
_resume: |
6157
|
0
|
|
|
|
|
|
_keys = _NNPS_trans_keys + _NNPS_key_offsets[cs]; |
6158
|
0
|
|
|
|
|
|
_trans = _NNPS_index_offsets[cs]; |
6159
|
|
|
|
|
|
|
|
6160
|
0
|
|
|
|
|
|
_klen = _NNPS_single_lengths[cs]; |
6161
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
6162
|
|
|
|
|
|
|
const char *_lower = _keys; |
6163
|
|
|
|
|
|
|
const char *_mid; |
6164
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
6165
|
|
|
|
|
|
|
while (1) { |
6166
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
6167
|
|
|
|
|
|
|
break; |
6168
|
|
|
|
|
|
|
|
6169
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
6170
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
6171
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
6172
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
6173
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
6174
|
|
|
|
|
|
|
else { |
6175
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
6176
|
0
|
|
|
|
|
|
goto _match; |
6177
|
|
|
|
|
|
|
} |
6178
|
|
|
|
|
|
|
} |
6179
|
0
|
|
|
|
|
|
_keys += _klen; |
6180
|
0
|
|
|
|
|
|
_trans += _klen; |
6181
|
|
|
|
|
|
|
} |
6182
|
|
|
|
|
|
|
|
6183
|
0
|
|
|
|
|
|
_klen = _NNPS_range_lengths[cs]; |
6184
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
6185
|
|
|
|
|
|
|
const char *_lower = _keys; |
6186
|
|
|
|
|
|
|
const char *_mid; |
6187
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
6188
|
|
|
|
|
|
|
while (1) { |
6189
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
6190
|
|
|
|
|
|
|
break; |
6191
|
|
|
|
|
|
|
|
6192
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
6193
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
6194
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
6195
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
6196
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
6197
|
|
|
|
|
|
|
else { |
6198
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
6199
|
0
|
|
|
|
|
|
goto _match; |
6200
|
|
|
|
|
|
|
} |
6201
|
|
|
|
|
|
|
} |
6202
|
0
|
|
|
|
|
|
_trans += _klen; |
6203
|
|
|
|
|
|
|
} |
6204
|
|
|
|
|
|
|
|
6205
|
|
|
|
|
|
|
_match: |
6206
|
0
|
|
|
|
|
|
_trans = _NNPS_indicies[_trans]; |
6207
|
0
|
|
|
|
|
|
cs = _NNPS_trans_targs[_trans]; |
6208
|
|
|
|
|
|
|
|
6209
|
0
|
0
|
|
|
|
|
if ( _NNPS_trans_actions[_trans] == 0 ) |
6210
|
|
|
|
|
|
|
goto _again; |
6211
|
|
|
|
|
|
|
|
6212
|
0
|
|
|
|
|
|
_acts = _NNPS_actions + _NNPS_trans_actions[_trans]; |
6213
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
6214
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
6215
|
|
|
|
|
|
|
{ |
6216
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
6217
|
|
|
|
|
|
|
{ |
6218
|
|
|
|
|
|
|
case 0: |
6219
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 2, append = "AN"; } |
6220
|
|
|
|
|
|
|
break; |
6221
|
|
|
|
|
|
|
case 1: |
6222
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 2, append = "an"; } |
6223
|
|
|
|
|
|
|
break; |
6224
|
|
|
|
|
|
|
case 2: |
6225
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
6226
|
|
|
|
|
|
|
break; |
6227
|
|
|
|
|
|
|
case 3: |
6228
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 3, append = "FE"; } |
6229
|
|
|
|
|
|
|
break; |
6230
|
|
|
|
|
|
|
case 4: |
6231
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 3, append = "fe"; } |
6232
|
|
|
|
|
|
|
break; |
6233
|
|
|
|
|
|
|
case 5: |
6234
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
6235
|
|
|
|
|
|
|
break; |
6236
|
|
|
|
|
|
|
case 6: |
6237
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
6238
|
|
|
|
|
|
|
break; |
6239
|
|
|
|
|
|
|
case 7: |
6240
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
6241
|
|
|
|
|
|
|
break; |
6242
|
|
|
|
|
|
|
case 8: |
6243
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
6244
|
|
|
|
|
|
|
break; |
6245
|
|
|
|
|
|
|
case 9: |
6246
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 2, append = nullptr; } |
6247
|
|
|
|
|
|
|
break; |
6248
|
|
|
|
|
|
|
case 10: |
6249
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 1, append = nullptr; } |
6250
|
|
|
|
|
|
|
break; |
6251
|
|
|
|
|
|
|
case 11: |
6252
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
6253
|
|
|
|
|
|
|
break; |
6254
|
|
|
|
|
|
|
case 12: |
6255
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
6256
|
|
|
|
|
|
|
break; |
6257
|
|
|
|
|
|
|
case 13: |
6258
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 3, append = "Y"; } |
6259
|
|
|
|
|
|
|
break; |
6260
|
|
|
|
|
|
|
case 14: |
6261
|
0
|
0
|
|
|
|
|
{ if (best > 'o') best = 'o', remove = 3, append = "y"; } |
6262
|
|
|
|
|
|
|
break; |
6263
|
|
|
|
|
|
|
case 15: |
6264
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 2, append = nullptr; } |
6265
|
|
|
|
|
|
|
break; |
6266
|
|
|
|
|
|
|
case 16: |
6267
|
0
|
0
|
|
|
|
|
{ if (best > 'q') best = 'q', remove = 1, append = nullptr; } |
6268
|
|
|
|
|
|
|
break; |
6269
|
|
|
|
|
|
|
} |
6270
|
|
|
|
|
|
|
} |
6271
|
|
|
|
|
|
|
|
6272
|
|
|
|
|
|
|
_again: |
6273
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
6274
|
|
|
|
|
|
|
goto _out; |
6275
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
6276
|
|
|
|
|
|
|
goto _resume; |
6277
|
|
|
|
|
|
|
_test_eof: {} |
6278
|
|
|
|
|
|
|
_out: {} |
6279
|
|
|
|
|
|
|
} |
6280
|
|
|
|
|
|
|
|
6281
|
0
|
0
|
|
|
|
|
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0
|
|
|
|
|
|
6282
|
0
|
|
|
|
|
|
} |
6283
|
|
|
|
|
|
|
|
6284
|
|
|
|
|
|
|
static const char _VBG_actions[] = { |
6285
|
|
|
|
|
|
|
0, 1, 1, 1, 2, 1, 4, 1, |
6286
|
|
|
|
|
|
|
5, 1, 6, 1, 7, 1, 9, 1, |
6287
|
|
|
|
|
|
|
10, 1, 11, 1, 12, 1, 13, 1, |
6288
|
|
|
|
|
|
|
14, 1, 15, 1, 16, 1, 17, 2, |
6289
|
|
|
|
|
|
|
0, 12, 2, 3, 4, 2, 5, 9, |
6290
|
|
|
|
|
|
|
2, 5, 10, 2, 8, 9, 2, 9, |
6291
|
|
|
|
|
|
|
10, 2, 11, 12, 3, 0, 2, 12, |
6292
|
|
|
|
|
|
|
3, 2, 11, 12 |
6293
|
|
|
|
|
|
|
}; |
6294
|
|
|
|
|
|
|
|
6295
|
|
|
|
|
|
|
static const short _VBG_key_offsets[] = { |
6296
|
|
|
|
|
|
|
0, 0, 1, 2, 3, 9, 14, 24, |
6297
|
|
|
|
|
|
|
29, 34, 44, 46, 47, 48, 49, 50, |
6298
|
|
|
|
|
|
|
51, 52, 59, 66, 68, 70, 71, 72, |
6299
|
|
|
|
|
|
|
73, 74, 75, 76, 81, 89, 90, 91, |
6300
|
|
|
|
|
|
|
92, 93, 94, 96, 97, 98, 99, 100, |
6301
|
|
|
|
|
|
|
101, 102, 127, 127, 136, 137, 142, 153, |
6302
|
|
|
|
|
|
|
162, 171, 181, 186, 191, 197, 207, 207, |
6303
|
|
|
|
|
|
|
216, 228, 229, 240, 240, 249, 258, 267, |
6304
|
|
|
|
|
|
|
276, 285, 290, 302, 313, 318, 324, 334, |
6305
|
|
|
|
|
|
|
344, 355, 362, 373, 382, 391, 391, 402, |
6306
|
|
|
|
|
|
|
413, 415, 416, 417, 417, 418, 426, 437, |
6307
|
|
|
|
|
|
|
442, 448, 458, 468, 479, 486, 497, 504, |
6308
|
|
|
|
|
|
|
510, 519, 528, 537, 543 |
6309
|
|
|
|
|
|
|
}; |
6310
|
|
|
|
|
|
|
|
6311
|
|
|
|
|
|
|
static const char _VBG_trans_keys[] = { |
6312
|
|
|
|
|
|
|
103, 110, 105, 97, 101, 105, 111, 117, |
6313
|
|
|
|
|
|
|
121, 97, 101, 105, 111, 117, 98, 100, |
6314
|
|
|
|
|
|
|
102, 104, 106, 110, 112, 116, 118, 122, |
6315
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 97, 101, 105, |
6316
|
|
|
|
|
|
|
111, 117, 98, 100, 102, 104, 106, 110, |
6317
|
|
|
|
|
|
|
112, 116, 118, 122, 98, 114, 105, 114, |
6318
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
6319
|
|
|
|
|
|
|
117, 98, 122, 97, 101, 105, 111, 117, |
6320
|
|
|
|
|
|
|
98, 122, 97, 122, 98, 114, 105, 114, |
6321
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
6322
|
|
|
|
|
|
|
117, 97, 101, 105, 110, 111, 115, 117, |
6323
|
|
|
|
|
|
|
120, 105, 112, 105, 109, 101, 98, 114, |
6324
|
|
|
|
|
|
|
105, 114, 112, 105, 109, 101, 98, 99, |
6325
|
|
|
|
|
|
|
100, 102, 103, 104, 106, 107, 108, 109, |
6326
|
|
|
|
|
|
|
110, 111, 112, 113, 114, 115, 116, 117, |
6327
|
|
|
|
|
|
|
118, 119, 120, 121, 122, 97, 105, 97, |
6328
|
|
|
|
|
|
|
98, 101, 105, 111, 117, 122, 99, 120, |
6329
|
|
|
|
|
|
|
113, 97, 101, 105, 111, 117, 98, 99, |
6330
|
|
|
|
|
|
|
100, 105, 111, 117, 122, 97, 101, 102, |
6331
|
|
|
|
|
|
|
120, 97, 100, 101, 105, 111, 117, 122, |
6332
|
|
|
|
|
|
|
98, 120, 97, 101, 102, 105, 111, 117, |
6333
|
|
|
|
|
|
|
122, 98, 120, 97, 101, 103, 105, 110, |
6334
|
|
|
|
|
|
|
111, 117, 122, 98, 120, 97, 101, 105, |
6335
|
|
|
|
|
|
|
111, 117, 101, 110, 111, 115, 120, 101, |
6336
|
|
|
|
|
|
|
110, 111, 112, 115, 120, 97, 101, 104, |
6337
|
|
|
|
|
|
|
105, 111, 116, 117, 122, 98, 120, 97, |
6338
|
|
|
|
|
|
|
101, 105, 106, 111, 117, 122, 98, 120, |
6339
|
|
|
|
|
|
|
98, 99, 100, 105, 107, 111, 117, 122, |
6340
|
|
|
|
|
|
|
97, 101, 102, 120, 105, 97, 101, 105, |
6341
|
|
|
|
|
|
|
108, 111, 114, 117, 119, 122, 98, 120, |
6342
|
|
|
|
|
|
|
97, 101, 105, 109, 111, 117, 122, 98, |
6343
|
|
|
|
|
|
|
120, 97, 101, 105, 110, 111, 117, 122, |
6344
|
|
|
|
|
|
|
98, 120, 97, 101, 105, 111, 112, 117, |
6345
|
|
|
|
|
|
|
122, 98, 120, 97, 101, 105, 111, 113, |
6346
|
|
|
|
|
|
|
117, 122, 98, 120, 97, 101, 105, 111, |
6347
|
|
|
|
|
|
|
114, 117, 122, 98, 120, 97, 101, 105, |
6348
|
|
|
|
|
|
|
111, 117, 98, 99, 100, 105, 108, 111, |
6349
|
|
|
|
|
|
|
116, 117, 97, 101, 102, 122, 101, 110, |
6350
|
|
|
|
|
|
|
111, 115, 120, 98, 104, 106, 116, 118, |
6351
|
|
|
|
|
|
|
122, 101, 110, 111, 115, 120, 101, 110, |
6352
|
|
|
|
|
|
|
111, 112, 115, 120, 101, 105, 110, 111, |
6353
|
|
|
|
|
|
|
115, 120, 98, 116, 118, 122, 101, 105, |
6354
|
|
|
|
|
|
|
110, 111, 115, 120, 98, 116, 118, 122, |
6355
|
|
|
|
|
|
|
101, 110, 111, 115, 120, 98, 104, 106, |
6356
|
|
|
|
|
|
|
116, 118, 122, 98, 101, 110, 111, 114, |
6357
|
|
|
|
|
|
|
115, 120, 101, 110, 111, 115, 120, 98, |
6358
|
|
|
|
|
|
|
104, 106, 116, 118, 122, 97, 101, 105, |
6359
|
|
|
|
|
|
|
111, 115, 117, 122, 98, 120, 97, 101, |
6360
|
|
|
|
|
|
|
105, 111, 116, 117, 122, 98, 120, 122, |
6361
|
|
|
|
|
|
|
98, 100, 102, 104, 106, 110, 112, 116, |
6362
|
|
|
|
|
|
|
118, 120, 122, 98, 100, 102, 104, 106, |
6363
|
|
|
|
|
|
|
110, 112, 116, 118, 120, 98, 114, 112, |
6364
|
|
|
|
|
|
|
114, 113, 97, 101, 105, 108, 111, 117, |
6365
|
|
|
|
|
|
|
98, 122, 101, 110, 111, 115, 120, 98, |
6366
|
|
|
|
|
|
|
104, 106, 116, 118, 122, 101, 110, 111, |
6367
|
|
|
|
|
|
|
115, 120, 101, 110, 111, 112, 115, 120, |
6368
|
|
|
|
|
|
|
101, 105, 110, 111, 115, 120, 98, 116, |
6369
|
|
|
|
|
|
|
118, 122, 101, 105, 110, 111, 115, 120, |
6370
|
|
|
|
|
|
|
98, 116, 118, 122, 101, 110, 111, 115, |
6371
|
|
|
|
|
|
|
120, 98, 104, 106, 116, 118, 122, 98, |
6372
|
|
|
|
|
|
|
101, 110, 111, 114, 115, 120, 101, 110, |
6373
|
|
|
|
|
|
|
111, 115, 120, 98, 104, 106, 116, 118, |
6374
|
|
|
|
|
|
|
122, 97, 101, 105, 111, 117, 98, 122, |
6375
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 121, 97, 101, |
6376
|
|
|
|
|
|
|
105, 111, 117, 118, 122, 98, 120, 97, |
6377
|
|
|
|
|
|
|
101, 105, 111, 117, 119, 122, 98, 120, |
6378
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 120, 122, 98, |
6379
|
|
|
|
|
|
|
119, 97, 101, 105, 111, 117, 121, 97, |
6380
|
|
|
|
|
|
|
101, 105, 111, 117, 121, 122, 98, 120, |
6381
|
|
|
|
|
|
|
0 |
6382
|
|
|
|
|
|
|
}; |
6383
|
|
|
|
|
|
|
|
6384
|
|
|
|
|
|
|
static const char _VBG_single_lengths[] = { |
6385
|
|
|
|
|
|
|
0, 1, 1, 1, 6, 5, 0, 5, |
6386
|
|
|
|
|
|
|
5, 0, 2, 1, 1, 1, 1, 1, |
6387
|
|
|
|
|
|
|
1, 5, 5, 0, 2, 1, 1, 1, |
6388
|
|
|
|
|
|
|
1, 1, 1, 5, 8, 1, 1, 1, |
6389
|
|
|
|
|
|
|
1, 1, 2, 1, 1, 1, 1, 1, |
6390
|
|
|
|
|
|
|
1, 23, 0, 7, 1, 5, 7, 7, |
6391
|
|
|
|
|
|
|
7, 8, 5, 5, 6, 8, 0, 7, |
6392
|
|
|
|
|
|
|
8, 1, 9, 0, 7, 7, 7, 7, |
6393
|
|
|
|
|
|
|
7, 5, 8, 5, 5, 6, 6, 6, |
6394
|
|
|
|
|
|
|
5, 7, 5, 7, 7, 0, 1, 1, |
6395
|
|
|
|
|
|
|
2, 1, 1, 0, 1, 6, 5, 5, |
6396
|
|
|
|
|
|
|
6, 6, 6, 5, 7, 5, 5, 6, |
6397
|
|
|
|
|
|
|
7, 7, 7, 6, 7 |
6398
|
|
|
|
|
|
|
}; |
6399
|
|
|
|
|
|
|
|
6400
|
|
|
|
|
|
|
static const char _VBG_range_lengths[] = { |
6401
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 5, 0, |
6402
|
|
|
|
|
|
|
0, 5, 0, 0, 0, 0, 0, 0, |
6403
|
|
|
|
|
|
|
0, 1, 1, 1, 0, 0, 0, 0, |
6404
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6405
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6406
|
|
|
|
|
|
|
0, 1, 0, 1, 0, 0, 2, 1, |
6407
|
|
|
|
|
|
|
1, 1, 0, 0, 0, 1, 0, 1, |
6408
|
|
|
|
|
|
|
2, 0, 1, 0, 1, 1, 1, 1, |
6409
|
|
|
|
|
|
|
1, 0, 2, 3, 0, 0, 2, 2, |
6410
|
|
|
|
|
|
|
3, 0, 3, 1, 1, 0, 5, 5, |
6411
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 1, 3, 0, |
6412
|
|
|
|
|
|
|
0, 2, 2, 3, 0, 3, 1, 0, |
6413
|
|
|
|
|
|
|
1, 1, 1, 0, 1 |
6414
|
|
|
|
|
|
|
}; |
6415
|
|
|
|
|
|
|
|
6416
|
|
|
|
|
|
|
static const short _VBG_index_offsets[] = { |
6417
|
|
|
|
|
|
|
0, 0, 2, 4, 6, 13, 19, 25, |
6418
|
|
|
|
|
|
|
31, 37, 43, 46, 48, 50, 52, 54, |
6419
|
|
|
|
|
|
|
56, 58, 65, 72, 74, 77, 79, 81, |
6420
|
|
|
|
|
|
|
83, 85, 87, 89, 95, 104, 106, 108, |
6421
|
|
|
|
|
|
|
110, 112, 114, 117, 119, 121, 123, 125, |
6422
|
|
|
|
|
|
|
127, 129, 154, 155, 164, 166, 172, 182, |
6423
|
|
|
|
|
|
|
191, 200, 210, 216, 222, 229, 239, 240, |
6424
|
|
|
|
|
|
|
249, 260, 262, 273, 274, 283, 292, 301, |
6425
|
|
|
|
|
|
|
310, 319, 325, 336, 345, 351, 358, 367, |
6426
|
|
|
|
|
|
|
376, 385, 393, 402, 411, 420, 421, 428, |
6427
|
|
|
|
|
|
|
435, 438, 440, 442, 443, 445, 453, 462, |
6428
|
|
|
|
|
|
|
468, 475, 484, 493, 502, 510, 519, 526, |
6429
|
|
|
|
|
|
|
533, 542, 551, 560, 567 |
6430
|
|
|
|
|
|
|
}; |
6431
|
|
|
|
|
|
|
|
6432
|
|
|
|
|
|
|
static const unsigned char _VBG_indicies[] = { |
6433
|
|
|
|
|
|
|
0, 1, 2, 1, 3, 1, 4, 4, |
6434
|
|
|
|
|
|
|
4, 4, 4, 4, 1, 5, 5, 5, |
6435
|
|
|
|
|
|
|
5, 6, 1, 7, 7, 7, 7, 7, |
6436
|
|
|
|
|
|
|
1, 8, 8, 8, 8, 9, 1, 5, |
6437
|
|
|
|
|
|
|
5, 5, 5, 10, 1, 11, 11, 11, |
6438
|
|
|
|
|
|
|
11, 11, 1, 11, 12, 1, 11, 1, |
6439
|
|
|
|
|
|
|
13, 1, 11, 1, 14, 1, 11, 1, |
6440
|
|
|
|
|
|
|
11, 1, 5, 5, 5, 5, 6, 15, |
6441
|
|
|
|
|
|
|
1, 5, 5, 5, 5, 6, 16, 1, |
6442
|
|
|
|
|
|
|
4, 1, 17, 18, 1, 17, 1, 19, |
6443
|
|
|
|
|
|
|
1, 17, 1, 20, 1, 17, 1, 17, |
6444
|
|
|
|
|
|
|
1, 21, 22, 21, 23, 24, 1, 25, |
6445
|
|
|
|
|
|
|
26, 25, 27, 28, 29, 25, 30, 1, |
6446
|
|
|
|
|
|
|
31, 1, 31, 1, 32, 1, 31, 1, |
6447
|
|
|
|
|
|
|
31, 1, 33, 34, 1, 33, 1, 35, |
6448
|
|
|
|
|
|
|
1, 33, 1, 36, 1, 33, 1, 33, |
6449
|
|
|
|
|
|
|
1, 38, 39, 40, 41, 42, 43, 44, |
6450
|
|
|
|
|
|
|
45, 46, 47, 48, 49, 50, 51, 52, |
6451
|
|
|
|
|
|
|
53, 54, 55, 56, 57, 58, 59, 60, |
6452
|
|
|
|
|
|
|
37, 1, 1, 61, 62, 61, 61, 61, |
6453
|
|
|
|
|
|
|
61, 63, 63, 1, 64, 1, 65, 65, |
6454
|
|
|
|
|
|
|
65, 65, 65, 1, 67, 68, 67, 66, |
6455
|
|
|
|
|
|
|
66, 66, 67, 66, 67, 1, 69, 62, |
6456
|
|
|
|
|
|
|
69, 69, 69, 69, 63, 63, 1, 61, |
6457
|
|
|
|
|
|
|
61, 62, 61, 61, 61, 63, 63, 1, |
6458
|
|
|
|
|
|
|
66, 66, 68, 66, 70, 66, 66, 67, |
6459
|
|
|
|
|
|
|
67, 1, 71, 71, 71, 71, 71, 1, |
6460
|
|
|
|
|
|
|
72, 73, 74, 75, 76, 1, 72, 73, |
6461
|
|
|
|
|
|
|
74, 11, 75, 76, 1, 61, 61, 62, |
6462
|
|
|
|
|
|
|
61, 61, 77, 61, 63, 63, 1, 78, |
6463
|
|
|
|
|
|
|
61, 61, 61, 62, 61, 61, 63, 63, |
6464
|
|
|
|
|
|
|
1, 63, 79, 63, 61, 62, 61, 61, |
6465
|
|
|
|
|
|
|
63, 61, 63, 1, 7, 1, 61, 61, |
6466
|
|
|
|
|
|
|
61, 68, 61, 80, 61, 80, 67, 67, |
6467
|
|
|
|
|
|
|
1, 5, 61, 61, 61, 62, 61, 61, |
6468
|
|
|
|
|
|
|
63, 63, 1, 81, 81, 82, 62, 81, |
6469
|
|
|
|
|
|
|
81, 63, 63, 1, 81, 81, 81, 81, |
6470
|
|
|
|
|
|
|
62, 81, 63, 63, 1, 61, 61, 61, |
6471
|
|
|
|
|
|
|
61, 62, 61, 63, 63, 1, 61, 83, |
6472
|
|
|
|
|
|
|
61, 84, 62, 61, 63, 63, 1, 5, |
6473
|
|
|
|
|
|
|
5, 5, 5, 6, 1, 85, 86, 85, |
6474
|
|
|
|
|
|
|
5, 86, 5, 86, 6, 5, 85, 1, |
6475
|
|
|
|
|
|
|
87, 88, 89, 90, 91, 85, 85, 85, |
6476
|
|
|
|
|
|
|
1, 87, 92, 89, 93, 94, 1, 87, |
6477
|
|
|
|
|
|
|
92, 89, 17, 93, 94, 1, 87, 17, |
6478
|
|
|
|
|
|
|
88, 89, 90, 91, 85, 85, 1, 87, |
6479
|
|
|
|
|
|
|
20, 88, 89, 90, 91, 85, 85, 1, |
6480
|
|
|
|
|
|
|
95, 88, 89, 90, 91, 85, 85, 85, |
6481
|
|
|
|
|
|
|
1, 17, 87, 92, 89, 18, 93, 94, |
6482
|
|
|
|
|
|
|
1, 87, 97, 89, 98, 99, 96, 96, |
6483
|
|
|
|
|
|
|
96, 1, 66, 66, 66, 66, 100, 66, |
6484
|
|
|
|
|
|
|
67, 67, 1, 101, 102, 103, 61, 62, |
6485
|
|
|
|
|
|
|
61, 63, 63, 1, 104, 106, 106, 106, |
6486
|
|
|
|
|
|
|
106, 106, 106, 105, 107, 107, 107, 107, |
6487
|
|
|
|
|
|
|
107, 107, 1, 31, 108, 1, 31, 1, |
6488
|
|
|
|
|
|
|
109, 1, 105, 110, 104, 5, 5, 5, |
6489
|
|
|
|
|
|
|
112, 5, 6, 111, 1, 113, 114, 115, |
6490
|
|
|
|
|
|
|
116, 117, 111, 111, 111, 1, 113, 118, |
6491
|
|
|
|
|
|
|
115, 119, 120, 1, 113, 118, 115, 33, |
6492
|
|
|
|
|
|
|
119, 120, 1, 113, 33, 114, 115, 116, |
6493
|
|
|
|
|
|
|
117, 111, 111, 1, 113, 36, 114, 115, |
6494
|
|
|
|
|
|
|
116, 117, 111, 111, 1, 121, 114, 115, |
6495
|
|
|
|
|
|
|
116, 117, 111, 111, 111, 1, 33, 113, |
6496
|
|
|
|
|
|
|
118, 115, 34, 119, 120, 1, 113, 123, |
6497
|
|
|
|
|
|
|
115, 124, 125, 122, 122, 122, 1, 5, |
6498
|
|
|
|
|
|
|
5, 5, 5, 6, 111, 1, 4, 4, |
6499
|
|
|
|
|
|
|
4, 4, 4, 4, 1, 66, 66, 66, |
6500
|
|
|
|
|
|
|
66, 66, 68, 67, 67, 1, 81, 81, |
6501
|
|
|
|
|
|
|
81, 81, 81, 62, 63, 63, 1, 81, |
6502
|
|
|
|
|
|
|
81, 81, 81, 81, 62, 63, 63, 1, |
6503
|
|
|
|
|
|
|
126, 126, 126, 126, 126, 4, 1, 127, |
6504
|
|
|
|
|
|
|
127, 127, 127, 127, 129, 130, 128, 1, |
6505
|
|
|
|
|
|
|
0 |
6506
|
|
|
|
|
|
|
}; |
6507
|
|
|
|
|
|
|
|
6508
|
|
|
|
|
|
|
static const char _VBG_trans_targs[] = { |
6509
|
|
|
|
|
|
|
2, 0, 3, 41, 42, 42, 44, 42, |
6510
|
|
|
|
|
|
|
42, 44, 44, 51, 52, 13, 15, 42, |
6511
|
|
|
|
|
|
|
42, 68, 69, 23, 25, 77, 78, 83, |
6512
|
|
|
|
|
|
|
84, 42, 80, 29, 82, 31, 33, 42, |
6513
|
|
|
|
|
|
|
32, 87, 88, 37, 39, 4, 43, 46, |
6514
|
|
|
|
|
|
|
47, 48, 49, 53, 55, 56, 58, 60, |
6515
|
|
|
|
|
|
|
61, 19, 62, 63, 64, 75, 76, 95, |
6516
|
|
|
|
|
|
|
96, 97, 98, 99, 100, 5, 45, 42, |
6517
|
|
|
|
|
|
|
42, 6, 7, 42, 45, 8, 50, 9, |
6518
|
|
|
|
|
|
|
10, 11, 12, 14, 16, 54, 42, 57, |
6519
|
|
|
|
|
|
|
59, 17, 18, 65, 66, 67, 74, 20, |
6520
|
|
|
|
|
|
|
70, 22, 71, 72, 21, 24, 26, 73, |
6521
|
|
|
|
|
|
|
67, 70, 71, 72, 45, 27, 85, 94, |
6522
|
|
|
|
|
|
|
42, 42, 79, 28, 81, 30, 42, 86, |
6523
|
|
|
|
|
|
|
93, 34, 89, 36, 90, 91, 35, 38, |
6524
|
|
|
|
|
|
|
40, 92, 86, 89, 90, 91, 65, 65, |
6525
|
|
|
|
|
|
|
42, 42, 45 |
6526
|
|
|
|
|
|
|
}; |
6527
|
|
|
|
|
|
|
|
6528
|
|
|
|
|
|
|
static const char _VBG_trans_actions[] = { |
6529
|
|
|
|
|
|
|
0, 0, 0, 29, 23, 15, 15, 3, |
6530
|
|
|
|
|
|
|
46, 46, 40, 0, 0, 0, 0, 5, |
6531
|
|
|
|
|
|
|
34, 0, 0, 0, 0, 15, 15, 15, |
6532
|
|
|
|
|
|
|
15, 11, 11, 0, 11, 0, 0, 9, |
6533
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6534
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6535
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 21, |
6536
|
|
|
|
|
|
|
0, 0, 0, 23, 0, 0, 19, 19, |
6537
|
|
|
|
|
|
|
7, 0, 0, 49, 49, 0, 49, 0, |
6538
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 19, 17, 19, |
6539
|
|
|
|
|
|
|
49, 0, 0, 27, 27, 0, 0, 0, |
6540
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6541
|
|
|
|
|
|
|
25, 25, 25, 25, 56, 0, 9, 9, |
6542
|
|
|
|
|
|
|
13, 43, 43, 0, 9, 0, 37, 0, |
6543
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6544
|
|
|
|
|
|
|
0, 0, 7, 7, 7, 7, 23, 1, |
6545
|
|
|
|
|
|
|
31, 1, 52 |
6546
|
|
|
|
|
|
|
}; |
6547
|
|
|
|
|
|
|
|
6548
|
|
|
|
|
|
|
static const char _VBG_eof_actions[] = { |
6549
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6550
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6551
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6552
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6553
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6554
|
|
|
|
|
|
|
0, 0, 0, 3, 0, 0, 3, 3, |
6555
|
|
|
|
|
|
|
3, 3, 0, 3, 3, 3, 0, 3, |
6556
|
|
|
|
|
|
|
3, 0, 3, 0, 3, 3, 3, 3, |
6557
|
|
|
|
|
|
|
3, 0, 0, 25, 25, 25, 25, 25, |
6558
|
|
|
|
|
|
|
25, 25, 25, 3, 3, 0, 0, 0, |
6559
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 7, 7, |
6560
|
|
|
|
|
|
|
7, 7, 7, 7, 7, 7, 0, 0, |
6561
|
|
|
|
|
|
|
3, 3, 3, 0, 3 |
6562
|
|
|
|
|
|
|
}; |
6563
|
|
|
|
|
|
|
|
6564
|
|
|
|
|
|
|
static const int VBG_start = 1; |
6565
|
|
|
|
|
|
|
|
6566
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_VBG(const string& form, vector& lemmas) const { |
6567
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
6568
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
6569
|
|
|
|
|
|
|
|
6570
|
|
|
|
|
|
|
{ |
6571
|
|
|
|
|
|
|
cs = VBG_start; |
6572
|
|
|
|
|
|
|
} |
6573
|
|
|
|
|
|
|
|
6574
|
|
|
|
|
|
|
{ |
6575
|
|
|
|
|
|
|
int _klen; |
6576
|
|
|
|
|
|
|
unsigned int _trans; |
6577
|
|
|
|
|
|
|
const char *_acts; |
6578
|
|
|
|
|
|
|
unsigned int _nacts; |
6579
|
|
|
|
|
|
|
const char *_keys; |
6580
|
|
|
|
|
|
|
|
6581
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
6582
|
|
|
|
|
|
|
goto _test_eof; |
6583
|
|
|
|
|
|
|
if ( cs == 0 ) |
6584
|
|
|
|
|
|
|
goto _out; |
6585
|
|
|
|
|
|
|
_resume: |
6586
|
0
|
|
|
|
|
|
_keys = _VBG_trans_keys + _VBG_key_offsets[cs]; |
6587
|
0
|
|
|
|
|
|
_trans = _VBG_index_offsets[cs]; |
6588
|
|
|
|
|
|
|
|
6589
|
0
|
|
|
|
|
|
_klen = _VBG_single_lengths[cs]; |
6590
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
6591
|
|
|
|
|
|
|
const char *_lower = _keys; |
6592
|
|
|
|
|
|
|
const char *_mid; |
6593
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
6594
|
|
|
|
|
|
|
while (1) { |
6595
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
6596
|
|
|
|
|
|
|
break; |
6597
|
|
|
|
|
|
|
|
6598
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
6599
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
6600
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
6601
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
6602
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
6603
|
|
|
|
|
|
|
else { |
6604
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
6605
|
0
|
|
|
|
|
|
goto _match; |
6606
|
|
|
|
|
|
|
} |
6607
|
|
|
|
|
|
|
} |
6608
|
0
|
|
|
|
|
|
_keys += _klen; |
6609
|
0
|
|
|
|
|
|
_trans += _klen; |
6610
|
|
|
|
|
|
|
} |
6611
|
|
|
|
|
|
|
|
6612
|
0
|
|
|
|
|
|
_klen = _VBG_range_lengths[cs]; |
6613
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
6614
|
|
|
|
|
|
|
const char *_lower = _keys; |
6615
|
|
|
|
|
|
|
const char *_mid; |
6616
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
6617
|
|
|
|
|
|
|
while (1) { |
6618
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
6619
|
|
|
|
|
|
|
break; |
6620
|
|
|
|
|
|
|
|
6621
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
6622
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
6623
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
6624
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
6625
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
6626
|
|
|
|
|
|
|
else { |
6627
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
6628
|
0
|
|
|
|
|
|
goto _match; |
6629
|
|
|
|
|
|
|
} |
6630
|
|
|
|
|
|
|
} |
6631
|
0
|
|
|
|
|
|
_trans += _klen; |
6632
|
|
|
|
|
|
|
} |
6633
|
|
|
|
|
|
|
|
6634
|
|
|
|
|
|
|
_match: |
6635
|
0
|
|
|
|
|
|
_trans = _VBG_indicies[_trans]; |
6636
|
0
|
|
|
|
|
|
cs = _VBG_trans_targs[_trans]; |
6637
|
|
|
|
|
|
|
|
6638
|
0
|
0
|
|
|
|
|
if ( _VBG_trans_actions[_trans] == 0 ) |
6639
|
|
|
|
|
|
|
goto _again; |
6640
|
|
|
|
|
|
|
|
6641
|
0
|
|
|
|
|
|
_acts = _VBG_actions + _VBG_trans_actions[_trans]; |
6642
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
6643
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
6644
|
|
|
|
|
|
|
{ |
6645
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
6646
|
|
|
|
|
|
|
{ |
6647
|
|
|
|
|
|
|
case 0: |
6648
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
6649
|
|
|
|
|
|
|
break; |
6650
|
|
|
|
|
|
|
case 1: |
6651
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 3, append = "e"; } |
6652
|
|
|
|
|
|
|
break; |
6653
|
|
|
|
|
|
|
case 2: |
6654
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
6655
|
|
|
|
|
|
|
break; |
6656
|
|
|
|
|
|
|
case 3: |
6657
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 3, append = "e"; } |
6658
|
|
|
|
|
|
|
break; |
6659
|
|
|
|
|
|
|
case 4: |
6660
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 3, append = nullptr; } |
6661
|
|
|
|
|
|
|
break; |
6662
|
|
|
|
|
|
|
case 5: |
6663
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
6664
|
|
|
|
|
|
|
break; |
6665
|
|
|
|
|
|
|
case 6: |
6666
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 3, append = nullptr; } |
6667
|
|
|
|
|
|
|
break; |
6668
|
|
|
|
|
|
|
case 7: |
6669
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 3, append = "e"; } |
6670
|
|
|
|
|
|
|
break; |
6671
|
|
|
|
|
|
|
case 8: |
6672
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 3, append = nullptr; } |
6673
|
|
|
|
|
|
|
break; |
6674
|
|
|
|
|
|
|
case 9: |
6675
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 3, append = "e"; } |
6676
|
|
|
|
|
|
|
break; |
6677
|
|
|
|
|
|
|
case 10: |
6678
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 3, append = nullptr; } |
6679
|
|
|
|
|
|
|
break; |
6680
|
|
|
|
|
|
|
case 11: |
6681
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 3, append = "e"; } |
6682
|
|
|
|
|
|
|
break; |
6683
|
|
|
|
|
|
|
case 12: |
6684
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 3, append = nullptr; } |
6685
|
|
|
|
|
|
|
break; |
6686
|
|
|
|
|
|
|
case 13: |
6687
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 3, append = "e"; } |
6688
|
|
|
|
|
|
|
break; |
6689
|
|
|
|
|
|
|
case 14: |
6690
|
0
|
0
|
|
|
|
|
{ if (best > 'o') best = 'o', remove = 3, append = nullptr; } |
6691
|
|
|
|
|
|
|
break; |
6692
|
|
|
|
|
|
|
case 15: |
6693
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
6694
|
|
|
|
|
|
|
break; |
6695
|
|
|
|
|
|
|
case 16: |
6696
|
0
|
0
|
|
|
|
|
{ if (best > 'q') best = 'q', remove = 3, append = nullptr; } |
6697
|
|
|
|
|
|
|
break; |
6698
|
|
|
|
|
|
|
case 17: |
6699
|
0
|
0
|
|
|
|
|
{ if (best > 'r') best = 'r', remove = 3, append = "e"; } |
6700
|
|
|
|
|
|
|
break; |
6701
|
|
|
|
|
|
|
} |
6702
|
|
|
|
|
|
|
} |
6703
|
|
|
|
|
|
|
|
6704
|
|
|
|
|
|
|
_again: |
6705
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
6706
|
|
|
|
|
|
|
goto _out; |
6707
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
6708
|
|
|
|
|
|
|
goto _resume; |
6709
|
|
|
|
|
|
|
_test_eof: {} |
6710
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
6711
|
|
|
|
|
|
|
{ |
6712
|
0
|
|
|
|
|
|
const char *__acts = _VBG_actions + _VBG_eof_actions[cs]; |
6713
|
0
|
|
|
|
|
|
unsigned int __nacts = (unsigned int) *__acts++; |
6714
|
0
|
0
|
|
|
|
|
while ( __nacts-- > 0 ) { |
6715
|
0
|
|
|
|
|
|
switch ( *__acts++ ) { |
6716
|
|
|
|
|
|
|
case 2: |
6717
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
6718
|
|
|
|
|
|
|
break; |
6719
|
|
|
|
|
|
|
case 5: |
6720
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
6721
|
|
|
|
|
|
|
break; |
6722
|
|
|
|
|
|
|
case 15: |
6723
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
6724
|
|
|
|
|
|
|
break; |
6725
|
|
|
|
|
|
|
} |
6726
|
|
|
|
|
|
|
} |
6727
|
|
|
|
|
|
|
} |
6728
|
|
|
|
|
|
|
|
6729
|
|
|
|
|
|
|
_out: {} |
6730
|
|
|
|
|
|
|
} |
6731
|
|
|
|
|
|
|
|
6732
|
0
|
0
|
|
|
|
|
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0
|
|
|
|
|
|
6733
|
0
|
|
|
|
|
|
} |
6734
|
|
|
|
|
|
|
|
6735
|
|
|
|
|
|
|
static const char _VBD_VBN_actions[] = { |
6736
|
|
|
|
|
|
|
0, 1, 0, 1, 2, 1, 3, 1, |
6737
|
|
|
|
|
|
|
4, 1, 5, 1, 6, 1, 7, 1, |
6738
|
|
|
|
|
|
|
8, 1, 9, 1, 10, 1, 11, 1, |
6739
|
|
|
|
|
|
|
13, 1, 14, 1, 15, 1, 16, 1, |
6740
|
|
|
|
|
|
|
17, 2, 1, 16, 2, 4, 5, 2, |
6741
|
|
|
|
|
|
|
8, 16, 2, 9, 13, 2, 9, 14, |
6742
|
|
|
|
|
|
|
2, 12, 13, 2, 13, 14, 2, 15, |
6743
|
|
|
|
|
|
|
16, 3, 1, 3, 16, 3, 3, 15, |
6744
|
|
|
|
|
|
|
16 |
6745
|
|
|
|
|
|
|
}; |
6746
|
|
|
|
|
|
|
|
6747
|
|
|
|
|
|
|
static const short _VBD_VBN_key_offsets[] = { |
6748
|
|
|
|
|
|
|
0, 0, 2, 3, 9, 14, 24, 29, |
6749
|
|
|
|
|
|
|
34, 44, 46, 47, 48, 49, 50, 51, |
6750
|
|
|
|
|
|
|
52, 60, 67, 74, 76, 77, 78, 79, |
6751
|
|
|
|
|
|
|
80, 81, 82, 87, 95, 96, 97, 98, |
6752
|
|
|
|
|
|
|
99, 100, 102, 103, 104, 105, 106, 107, |
6753
|
|
|
|
|
|
|
108, 114, 115, 140, 140, 149, 150, 155, |
6754
|
|
|
|
|
|
|
166, 175, 184, 194, 199, 204, 210, 220, |
6755
|
|
|
|
|
|
|
220, 229, 241, 242, 253, 253, 262, 271, |
6756
|
|
|
|
|
|
|
280, 289, 298, 303, 316, 327, 332, 338, |
6757
|
|
|
|
|
|
|
348, 358, 369, 376, 387, 396, 405, 405, |
6758
|
|
|
|
|
|
|
416, 427, 429, 430, 431, 431, 432, 440, |
6759
|
|
|
|
|
|
|
451, 456, 462, 472, 482, 493, 500, 511, |
6760
|
|
|
|
|
|
|
518, 524, 533, 542, 551 |
6761
|
|
|
|
|
|
|
}; |
6762
|
|
|
|
|
|
|
|
6763
|
|
|
|
|
|
|
static const char _VBD_VBN_trans_keys[] = { |
6764
|
|
|
|
|
|
|
100, 110, 101, 97, 101, 105, 111, 117, |
6765
|
|
|
|
|
|
|
121, 97, 101, 105, 111, 117, 98, 100, |
6766
|
|
|
|
|
|
|
102, 104, 106, 110, 112, 116, 118, 122, |
6767
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 97, 101, 105, |
6768
|
|
|
|
|
|
|
111, 117, 98, 100, 102, 104, 106, 110, |
6769
|
|
|
|
|
|
|
112, 116, 118, 122, 98, 114, 105, 114, |
6770
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
6771
|
|
|
|
|
|
|
117, 121, 98, 122, 97, 101, 105, 111, |
6772
|
|
|
|
|
|
|
117, 98, 122, 97, 101, 105, 111, 117, |
6773
|
|
|
|
|
|
|
98, 122, 98, 114, 105, 114, 112, 105, |
6774
|
|
|
|
|
|
|
109, 101, 97, 101, 105, 111, 117, 97, |
6775
|
|
|
|
|
|
|
101, 105, 110, 111, 115, 117, 120, 105, |
6776
|
|
|
|
|
|
|
112, 105, 109, 101, 98, 114, 105, 114, |
6777
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
6778
|
|
|
|
|
|
|
117, 121, 101, 98, 99, 100, 102, 103, |
6779
|
|
|
|
|
|
|
104, 105, 106, 107, 108, 109, 110, 112, |
6780
|
|
|
|
|
|
|
113, 114, 115, 116, 117, 118, 119, 120, |
6781
|
|
|
|
|
|
|
121, 122, 97, 111, 97, 98, 101, 105, |
6782
|
|
|
|
|
|
|
111, 117, 122, 99, 120, 113, 97, 101, |
6783
|
|
|
|
|
|
|
105, 111, 117, 98, 99, 100, 105, 111, |
6784
|
|
|
|
|
|
|
117, 122, 97, 101, 102, 120, 97, 100, |
6785
|
|
|
|
|
|
|
101, 105, 111, 117, 122, 98, 120, 97, |
6786
|
|
|
|
|
|
|
101, 102, 105, 111, 117, 122, 98, 120, |
6787
|
|
|
|
|
|
|
97, 101, 103, 105, 110, 111, 117, 122, |
6788
|
|
|
|
|
|
|
98, 120, 97, 101, 105, 111, 117, 101, |
6789
|
|
|
|
|
|
|
110, 111, 115, 120, 101, 110, 111, 112, |
6790
|
|
|
|
|
|
|
115, 120, 97, 101, 104, 105, 111, 116, |
6791
|
|
|
|
|
|
|
117, 122, 98, 120, 97, 101, 105, 106, |
6792
|
|
|
|
|
|
|
111, 117, 122, 98, 120, 98, 99, 100, |
6793
|
|
|
|
|
|
|
105, 107, 111, 117, 122, 97, 101, 102, |
6794
|
|
|
|
|
|
|
120, 105, 97, 101, 105, 108, 111, 114, |
6795
|
|
|
|
|
|
|
117, 119, 122, 98, 120, 97, 101, 105, |
6796
|
|
|
|
|
|
|
109, 111, 117, 122, 98, 120, 97, 101, |
6797
|
|
|
|
|
|
|
105, 110, 111, 117, 122, 98, 120, 97, |
6798
|
|
|
|
|
|
|
101, 105, 111, 112, 117, 122, 98, 120, |
6799
|
|
|
|
|
|
|
97, 101, 105, 111, 113, 117, 122, 98, |
6800
|
|
|
|
|
|
|
120, 97, 101, 105, 111, 114, 117, 122, |
6801
|
|
|
|
|
|
|
98, 120, 97, 101, 105, 111, 117, 98, |
6802
|
|
|
|
|
|
|
99, 100, 105, 108, 110, 111, 116, 117, |
6803
|
|
|
|
|
|
|
97, 101, 102, 122, 101, 110, 111, 115, |
6804
|
|
|
|
|
|
|
120, 98, 104, 106, 116, 118, 122, 101, |
6805
|
|
|
|
|
|
|
110, 111, 115, 120, 101, 110, 111, 112, |
6806
|
|
|
|
|
|
|
115, 120, 101, 105, 110, 111, 115, 120, |
6807
|
|
|
|
|
|
|
98, 116, 118, 122, 101, 105, 110, 111, |
6808
|
|
|
|
|
|
|
115, 120, 98, 116, 118, 122, 101, 110, |
6809
|
|
|
|
|
|
|
111, 115, 120, 98, 104, 106, 116, 118, |
6810
|
|
|
|
|
|
|
122, 98, 101, 110, 111, 114, 115, 120, |
6811
|
|
|
|
|
|
|
101, 110, 111, 115, 120, 98, 104, 106, |
6812
|
|
|
|
|
|
|
116, 118, 122, 97, 101, 105, 111, 115, |
6813
|
|
|
|
|
|
|
117, 122, 98, 120, 97, 101, 105, 111, |
6814
|
|
|
|
|
|
|
116, 117, 122, 98, 120, 122, 98, 100, |
6815
|
|
|
|
|
|
|
102, 104, 106, 110, 112, 116, 118, 120, |
6816
|
|
|
|
|
|
|
122, 98, 100, 102, 104, 106, 110, 112, |
6817
|
|
|
|
|
|
|
116, 118, 120, 98, 114, 112, 114, 113, |
6818
|
|
|
|
|
|
|
97, 101, 105, 108, 111, 117, 98, 122, |
6819
|
|
|
|
|
|
|
101, 110, 111, 115, 120, 98, 104, 106, |
6820
|
|
|
|
|
|
|
116, 118, 122, 101, 110, 111, 115, 120, |
6821
|
|
|
|
|
|
|
101, 110, 111, 112, 115, 120, 101, 105, |
6822
|
|
|
|
|
|
|
110, 111, 115, 120, 98, 116, 118, 122, |
6823
|
|
|
|
|
|
|
101, 105, 110, 111, 115, 120, 98, 116, |
6824
|
|
|
|
|
|
|
118, 122, 101, 110, 111, 115, 120, 98, |
6825
|
|
|
|
|
|
|
104, 106, 116, 118, 122, 98, 101, 110, |
6826
|
|
|
|
|
|
|
111, 114, 115, 120, 101, 110, 111, 115, |
6827
|
|
|
|
|
|
|
120, 98, 104, 106, 116, 118, 122, 97, |
6828
|
|
|
|
|
|
|
101, 105, 111, 117, 98, 122, 97, 101, |
6829
|
|
|
|
|
|
|
105, 111, 117, 121, 97, 101, 105, 111, |
6830
|
|
|
|
|
|
|
117, 118, 122, 98, 120, 97, 101, 105, |
6831
|
|
|
|
|
|
|
111, 117, 119, 122, 98, 120, 97, 101, |
6832
|
|
|
|
|
|
|
105, 111, 117, 120, 122, 98, 119, 97, |
6833
|
|
|
|
|
|
|
101, 105, 111, 117, 121, 122, 98, 120, |
6834
|
|
|
|
|
|
|
0 |
6835
|
|
|
|
|
|
|
}; |
6836
|
|
|
|
|
|
|
|
6837
|
|
|
|
|
|
|
static const char _VBD_VBN_single_lengths[] = { |
6838
|
|
|
|
|
|
|
0, 2, 1, 6, 5, 0, 5, 5, |
6839
|
|
|
|
|
|
|
0, 2, 1, 1, 1, 1, 1, 1, |
6840
|
|
|
|
|
|
|
6, 5, 5, 2, 1, 1, 1, 1, |
6841
|
|
|
|
|
|
|
1, 1, 5, 8, 1, 1, 1, 1, |
6842
|
|
|
|
|
|
|
1, 2, 1, 1, 1, 1, 1, 1, |
6843
|
|
|
|
|
|
|
6, 1, 23, 0, 7, 1, 5, 7, |
6844
|
|
|
|
|
|
|
7, 7, 8, 5, 5, 6, 8, 0, |
6845
|
|
|
|
|
|
|
7, 8, 1, 9, 0, 7, 7, 7, |
6846
|
|
|
|
|
|
|
7, 7, 5, 9, 5, 5, 6, 6, |
6847
|
|
|
|
|
|
|
6, 5, 7, 5, 7, 7, 0, 1, |
6848
|
|
|
|
|
|
|
1, 2, 1, 1, 0, 1, 6, 5, |
6849
|
|
|
|
|
|
|
5, 6, 6, 6, 5, 7, 5, 5, |
6850
|
|
|
|
|
|
|
6, 7, 7, 7, 7 |
6851
|
|
|
|
|
|
|
}; |
6852
|
|
|
|
|
|
|
|
6853
|
|
|
|
|
|
|
static const char _VBD_VBN_range_lengths[] = { |
6854
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 5, 0, 0, |
6855
|
|
|
|
|
|
|
5, 0, 0, 0, 0, 0, 0, 0, |
6856
|
|
|
|
|
|
|
1, 1, 1, 0, 0, 0, 0, 0, |
6857
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6858
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6859
|
|
|
|
|
|
|
0, 0, 1, 0, 1, 0, 0, 2, |
6860
|
|
|
|
|
|
|
1, 1, 1, 0, 0, 0, 1, 0, |
6861
|
|
|
|
|
|
|
1, 2, 0, 1, 0, 1, 1, 1, |
6862
|
|
|
|
|
|
|
1, 1, 0, 2, 3, 0, 0, 2, |
6863
|
|
|
|
|
|
|
2, 3, 0, 3, 1, 1, 0, 5, |
6864
|
|
|
|
|
|
|
5, 0, 0, 0, 0, 0, 1, 3, |
6865
|
|
|
|
|
|
|
0, 0, 2, 2, 3, 0, 3, 1, |
6866
|
|
|
|
|
|
|
0, 1, 1, 1, 1 |
6867
|
|
|
|
|
|
|
}; |
6868
|
|
|
|
|
|
|
|
6869
|
|
|
|
|
|
|
static const short _VBD_VBN_index_offsets[] = { |
6870
|
|
|
|
|
|
|
0, 0, 3, 5, 12, 18, 24, 30, |
6871
|
|
|
|
|
|
|
36, 42, 45, 47, 49, 51, 53, 55, |
6872
|
|
|
|
|
|
|
57, 65, 72, 79, 82, 84, 86, 88, |
6873
|
|
|
|
|
|
|
90, 92, 94, 100, 109, 111, 113, 115, |
6874
|
|
|
|
|
|
|
117, 119, 122, 124, 126, 128, 130, 132, |
6875
|
|
|
|
|
|
|
134, 141, 143, 168, 169, 178, 180, 186, |
6876
|
|
|
|
|
|
|
196, 205, 214, 224, 230, 236, 243, 253, |
6877
|
|
|
|
|
|
|
254, 263, 274, 276, 287, 288, 297, 306, |
6878
|
|
|
|
|
|
|
315, 324, 333, 339, 351, 360, 366, 373, |
6879
|
|
|
|
|
|
|
382, 391, 400, 408, 417, 426, 435, 436, |
6880
|
|
|
|
|
|
|
443, 450, 453, 455, 457, 458, 460, 468, |
6881
|
|
|
|
|
|
|
477, 483, 490, 499, 508, 517, 525, 534, |
6882
|
|
|
|
|
|
|
541, 548, 557, 566, 575 |
6883
|
|
|
|
|
|
|
}; |
6884
|
|
|
|
|
|
|
|
6885
|
|
|
|
|
|
|
static const unsigned char _VBD_VBN_indicies[] = { |
6886
|
|
|
|
|
|
|
0, 2, 1, 3, 1, 4, 4, 4, |
6887
|
|
|
|
|
|
|
4, 4, 4, 1, 5, 5, 5, 5, |
6888
|
|
|
|
|
|
|
6, 1, 7, 7, 7, 7, 7, 1, |
6889
|
|
|
|
|
|
|
8, 8, 8, 8, 9, 1, 5, 5, |
6890
|
|
|
|
|
|
|
5, 5, 10, 1, 11, 11, 11, 11, |
6891
|
|
|
|
|
|
|
11, 1, 11, 12, 1, 11, 1, 13, |
6892
|
|
|
|
|
|
|
1, 11, 1, 14, 1, 11, 1, 11, |
6893
|
|
|
|
|
|
|
1, 4, 4, 4, 4, 4, 16, 15, |
6894
|
|
|
|
|
|
|
1, 5, 5, 5, 5, 6, 17, 1, |
6895
|
|
|
|
|
|
|
5, 5, 5, 5, 6, 18, 1, 19, |
6896
|
|
|
|
|
|
|
20, 1, 19, 1, 21, 1, 19, 1, |
6897
|
|
|
|
|
|
|
22, 1, 19, 1, 19, 1, 23, 24, |
6898
|
|
|
|
|
|
|
23, 25, 26, 1, 27, 28, 27, 29, |
6899
|
|
|
|
|
|
|
30, 31, 27, 32, 1, 33, 1, 33, |
6900
|
|
|
|
|
|
|
1, 34, 1, 33, 1, 33, 1, 35, |
6901
|
|
|
|
|
|
|
36, 1, 35, 1, 37, 1, 35, 1, |
6902
|
|
|
|
|
|
|
38, 1, 35, 1, 35, 1, 39, 39, |
6903
|
|
|
|
|
|
|
39, 39, 39, 4, 1, 40, 1, 42, |
6904
|
|
|
|
|
|
|
43, 44, 45, 46, 47, 48, 49, 50, |
6905
|
|
|
|
|
|
|
51, 52, 53, 54, 55, 56, 57, 58, |
6906
|
|
|
|
|
|
|
59, 60, 61, 62, 63, 64, 41, 1, |
6907
|
|
|
|
|
|
|
1, 65, 66, 65, 65, 65, 65, 4, |
6908
|
|
|
|
|
|
|
4, 1, 67, 1, 68, 68, 68, 68, |
6909
|
|
|
|
|
|
|
68, 1, 70, 71, 70, 69, 69, 69, |
6910
|
|
|
|
|
|
|
70, 69, 70, 1, 72, 66, 72, 72, |
6911
|
|
|
|
|
|
|
72, 72, 4, 4, 1, 65, 65, 66, |
6912
|
|
|
|
|
|
|
65, 65, 65, 4, 4, 1, 69, 69, |
6913
|
|
|
|
|
|
|
71, 69, 73, 69, 69, 70, 70, 1, |
6914
|
|
|
|
|
|
|
74, 74, 74, 74, 74, 1, 75, 76, |
6915
|
|
|
|
|
|
|
77, 78, 79, 1, 75, 76, 77, 11, |
6916
|
|
|
|
|
|
|
78, 79, 1, 65, 65, 66, 65, 65, |
6917
|
|
|
|
|
|
|
80, 65, 4, 4, 1, 81, 65, 65, |
6918
|
|
|
|
|
|
|
65, 66, 65, 65, 4, 4, 1, 4, |
6919
|
|
|
|
|
|
|
82, 4, 65, 66, 65, 65, 4, 65, |
6920
|
|
|
|
|
|
|
4, 1, 7, 1, 65, 65, 65, 71, |
6921
|
|
|
|
|
|
|
65, 83, 65, 83, 70, 70, 1, 5, |
6922
|
|
|
|
|
|
|
65, 65, 65, 66, 65, 65, 4, 4, |
6923
|
|
|
|
|
|
|
1, 84, 84, 85, 66, 84, 84, 4, |
6924
|
|
|
|
|
|
|
4, 1, 84, 84, 84, 84, 66, 84, |
6925
|
|
|
|
|
|
|
4, 4, 1, 65, 65, 65, 65, 66, |
6926
|
|
|
|
|
|
|
65, 4, 4, 1, 65, 86, 65, 87, |
6927
|
|
|
|
|
|
|
66, 65, 4, 4, 1, 5, 5, 5, |
6928
|
|
|
|
|
|
|
5, 6, 1, 88, 89, 88, 5, 89, |
6929
|
|
|
|
|
|
|
89, 5, 89, 6, 5, 88, 1, 90, |
6930
|
|
|
|
|
|
|
91, 92, 93, 94, 88, 88, 88, 1, |
6931
|
|
|
|
|
|
|
90, 95, 92, 96, 97, 1, 90, 95, |
6932
|
|
|
|
|
|
|
92, 19, 96, 97, 1, 90, 19, 91, |
6933
|
|
|
|
|
|
|
92, 93, 94, 88, 88, 1, 90, 22, |
6934
|
|
|
|
|
|
|
91, 92, 93, 94, 88, 88, 1, 98, |
6935
|
|
|
|
|
|
|
91, 92, 93, 94, 88, 88, 88, 1, |
6936
|
|
|
|
|
|
|
19, 90, 95, 92, 20, 96, 97, 1, |
6937
|
|
|
|
|
|
|
90, 100, 92, 101, 102, 99, 99, 99, |
6938
|
|
|
|
|
|
|
1, 69, 69, 69, 69, 103, 69, 70, |
6939
|
|
|
|
|
|
|
70, 1, 104, 105, 106, 65, 66, 65, |
6940
|
|
|
|
|
|
|
4, 4, 1, 107, 109, 109, 109, 109, |
6941
|
|
|
|
|
|
|
109, 109, 108, 110, 110, 110, 110, 110, |
6942
|
|
|
|
|
|
|
110, 1, 33, 111, 1, 33, 1, 112, |
6943
|
|
|
|
|
|
|
1, 108, 113, 107, 5, 5, 5, 115, |
6944
|
|
|
|
|
|
|
5, 6, 114, 1, 116, 117, 118, 119, |
6945
|
|
|
|
|
|
|
120, 114, 114, 114, 1, 116, 121, 118, |
6946
|
|
|
|
|
|
|
122, 123, 1, 116, 121, 118, 35, 122, |
6947
|
|
|
|
|
|
|
123, 1, 116, 35, 117, 118, 119, 120, |
6948
|
|
|
|
|
|
|
114, 114, 1, 116, 38, 117, 118, 119, |
6949
|
|
|
|
|
|
|
120, 114, 114, 1, 124, 117, 118, 119, |
6950
|
|
|
|
|
|
|
120, 114, 114, 114, 1, 35, 116, 121, |
6951
|
|
|
|
|
|
|
118, 36, 122, 123, 1, 116, 126, 118, |
6952
|
|
|
|
|
|
|
127, 128, 125, 125, 125, 1, 5, 5, |
6953
|
|
|
|
|
|
|
5, 5, 6, 114, 1, 4, 4, 4, |
6954
|
|
|
|
|
|
|
4, 4, 4, 1, 69, 69, 69, 69, |
6955
|
|
|
|
|
|
|
69, 71, 70, 70, 1, 84, 84, 84, |
6956
|
|
|
|
|
|
|
84, 84, 66, 4, 4, 1, 84, 84, |
6957
|
|
|
|
|
|
|
84, 84, 84, 66, 4, 4, 1, 129, |
6958
|
|
|
|
|
|
|
129, 129, 129, 129, 131, 132, 130, 1, |
6959
|
|
|
|
|
|
|
0 |
6960
|
|
|
|
|
|
|
}; |
6961
|
|
|
|
|
|
|
|
6962
|
|
|
|
|
|
|
static const char _VBD_VBN_trans_targs[] = { |
6963
|
|
|
|
|
|
|
2, 0, 41, 42, 43, 43, 45, 43, |
6964
|
|
|
|
|
|
|
43, 45, 45, 52, 53, 12, 14, 43, |
6965
|
|
|
|
|
|
|
43, 43, 43, 69, 70, 22, 24, 78, |
6966
|
|
|
|
|
|
|
79, 84, 85, 43, 81, 28, 83, 30, |
6967
|
|
|
|
|
|
|
32, 43, 31, 88, 89, 36, 38, 66, |
6968
|
|
|
|
|
|
|
43, 3, 44, 47, 48, 49, 50, 54, |
6969
|
|
|
|
|
|
|
16, 56, 57, 59, 61, 62, 63, 64, |
6970
|
|
|
|
|
|
|
65, 76, 77, 96, 97, 98, 99, 40, |
6971
|
|
|
|
|
|
|
100, 4, 46, 43, 5, 6, 43, 46, |
6972
|
|
|
|
|
|
|
7, 51, 8, 9, 10, 11, 13, 15, |
6973
|
|
|
|
|
|
|
55, 43, 58, 60, 17, 18, 66, 67, |
6974
|
|
|
|
|
|
|
68, 75, 19, 71, 21, 72, 73, 20, |
6975
|
|
|
|
|
|
|
23, 25, 74, 68, 71, 72, 73, 46, |
6976
|
|
|
|
|
|
|
26, 86, 95, 43, 43, 80, 27, 82, |
6977
|
|
|
|
|
|
|
29, 43, 87, 94, 33, 90, 35, 91, |
6978
|
|
|
|
|
|
|
92, 34, 37, 39, 93, 87, 90, 91, |
6979
|
|
|
|
|
|
|
92, 66, 43, 43, 46 |
6980
|
|
|
|
|
|
|
}; |
6981
|
|
|
|
|
|
|
|
6982
|
|
|
|
|
|
|
static const char _VBD_VBN_trans_actions[] = { |
6983
|
|
|
|
|
|
|
0, 0, 0, 31, 29, 25, 25, 5, |
6984
|
|
|
|
|
|
|
51, 51, 45, 0, 0, 0, 0, 15, |
6985
|
|
|
|
|
|
|
39, 9, 36, 0, 0, 0, 0, 25, |
6986
|
|
|
|
|
|
|
25, 25, 25, 21, 21, 0, 21, 0, |
6987
|
|
|
|
|
|
|
0, 19, 0, 0, 0, 0, 0, 29, |
6988
|
|
|
|
|
|
|
1, 0, 0, 0, 0, 0, 0, 0, |
6989
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6990
|
|
|
|
|
|
|
0, 0, 0, 27, 0, 0, 0, 0, |
6991
|
|
|
|
|
|
|
0, 0, 29, 17, 0, 0, 54, 54, |
6992
|
|
|
|
|
|
|
0, 54, 0, 0, 0, 0, 0, 0, |
6993
|
|
|
|
|
|
|
29, 27, 29, 54, 0, 0, 13, 13, |
6994
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
6995
|
|
|
|
|
|
|
0, 0, 0, 7, 7, 7, 7, 61, |
6996
|
|
|
|
|
|
|
0, 19, 19, 23, 48, 48, 0, 19, |
6997
|
|
|
|
|
|
|
0, 42, 0, 0, 0, 0, 0, 0, |
6998
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 17, 17, 17, |
6999
|
|
|
|
|
|
|
17, 3, 33, 3, 57 |
7000
|
|
|
|
|
|
|
}; |
7001
|
|
|
|
|
|
|
|
7002
|
|
|
|
|
|
|
static const char _VBD_VBN_eof_actions[] = { |
7003
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7004
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7005
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7006
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7007
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7008
|
|
|
|
|
|
|
0, 0, 0, 0, 5, 0, 0, 5, |
7009
|
|
|
|
|
|
|
5, 5, 5, 0, 5, 5, 5, 0, |
7010
|
|
|
|
|
|
|
5, 5, 0, 5, 0, 5, 5, 5, |
7011
|
|
|
|
|
|
|
5, 5, 0, 0, 11, 11, 11, 11, |
7012
|
|
|
|
|
|
|
11, 11, 11, 11, 5, 5, 0, 0, |
7013
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 17, |
7014
|
|
|
|
|
|
|
17, 17, 17, 17, 17, 17, 17, 0, |
7015
|
|
|
|
|
|
|
0, 5, 5, 5, 5 |
7016
|
|
|
|
|
|
|
}; |
7017
|
|
|
|
|
|
|
|
7018
|
|
|
|
|
|
|
static const int VBD_VBN_start = 1; |
7019
|
|
|
|
|
|
|
|
7020
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_VBD_VBN(const string& form, vector& lemmas) const { |
7021
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
7022
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
7023
|
|
|
|
|
|
|
|
7024
|
|
|
|
|
|
|
{ |
7025
|
|
|
|
|
|
|
cs = VBD_VBN_start; |
7026
|
|
|
|
|
|
|
} |
7027
|
|
|
|
|
|
|
|
7028
|
|
|
|
|
|
|
{ |
7029
|
|
|
|
|
|
|
int _klen; |
7030
|
|
|
|
|
|
|
unsigned int _trans; |
7031
|
|
|
|
|
|
|
const char *_acts; |
7032
|
|
|
|
|
|
|
unsigned int _nacts; |
7033
|
|
|
|
|
|
|
const char *_keys; |
7034
|
|
|
|
|
|
|
|
7035
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
7036
|
|
|
|
|
|
|
goto _test_eof; |
7037
|
|
|
|
|
|
|
if ( cs == 0 ) |
7038
|
|
|
|
|
|
|
goto _out; |
7039
|
|
|
|
|
|
|
_resume: |
7040
|
0
|
|
|
|
|
|
_keys = _VBD_VBN_trans_keys + _VBD_VBN_key_offsets[cs]; |
7041
|
0
|
|
|
|
|
|
_trans = _VBD_VBN_index_offsets[cs]; |
7042
|
|
|
|
|
|
|
|
7043
|
0
|
|
|
|
|
|
_klen = _VBD_VBN_single_lengths[cs]; |
7044
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7045
|
|
|
|
|
|
|
const char *_lower = _keys; |
7046
|
|
|
|
|
|
|
const char *_mid; |
7047
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
7048
|
|
|
|
|
|
|
while (1) { |
7049
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7050
|
|
|
|
|
|
|
break; |
7051
|
|
|
|
|
|
|
|
7052
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
7053
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
7054
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
7055
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
7056
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
7057
|
|
|
|
|
|
|
else { |
7058
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
7059
|
0
|
|
|
|
|
|
goto _match; |
7060
|
|
|
|
|
|
|
} |
7061
|
|
|
|
|
|
|
} |
7062
|
0
|
|
|
|
|
|
_keys += _klen; |
7063
|
0
|
|
|
|
|
|
_trans += _klen; |
7064
|
|
|
|
|
|
|
} |
7065
|
|
|
|
|
|
|
|
7066
|
0
|
|
|
|
|
|
_klen = _VBD_VBN_range_lengths[cs]; |
7067
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7068
|
|
|
|
|
|
|
const char *_lower = _keys; |
7069
|
|
|
|
|
|
|
const char *_mid; |
7070
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
7071
|
|
|
|
|
|
|
while (1) { |
7072
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7073
|
|
|
|
|
|
|
break; |
7074
|
|
|
|
|
|
|
|
7075
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
7076
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
7077
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
7078
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
7079
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
7080
|
|
|
|
|
|
|
else { |
7081
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
7082
|
0
|
|
|
|
|
|
goto _match; |
7083
|
|
|
|
|
|
|
} |
7084
|
|
|
|
|
|
|
} |
7085
|
0
|
|
|
|
|
|
_trans += _klen; |
7086
|
|
|
|
|
|
|
} |
7087
|
|
|
|
|
|
|
|
7088
|
|
|
|
|
|
|
_match: |
7089
|
0
|
|
|
|
|
|
_trans = _VBD_VBN_indicies[_trans]; |
7090
|
0
|
|
|
|
|
|
cs = _VBD_VBN_trans_targs[_trans]; |
7091
|
|
|
|
|
|
|
|
7092
|
0
|
0
|
|
|
|
|
if ( _VBD_VBN_trans_actions[_trans] == 0 ) |
7093
|
|
|
|
|
|
|
goto _again; |
7094
|
|
|
|
|
|
|
|
7095
|
0
|
|
|
|
|
|
_acts = _VBD_VBN_actions + _VBD_VBN_trans_actions[_trans]; |
7096
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
7097
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
7098
|
|
|
|
|
|
|
{ |
7099
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
7100
|
|
|
|
|
|
|
{ |
7101
|
|
|
|
|
|
|
case 0: |
7102
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
7103
|
|
|
|
|
|
|
break; |
7104
|
|
|
|
|
|
|
case 1: |
7105
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
7106
|
|
|
|
|
|
|
break; |
7107
|
|
|
|
|
|
|
case 2: |
7108
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
7109
|
|
|
|
|
|
|
break; |
7110
|
|
|
|
|
|
|
case 3: |
7111
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7112
|
|
|
|
|
|
|
break; |
7113
|
|
|
|
|
|
|
case 4: |
7114
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7115
|
|
|
|
|
|
|
break; |
7116
|
|
|
|
|
|
|
case 5: |
7117
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7118
|
|
|
|
|
|
|
break; |
7119
|
|
|
|
|
|
|
case 7: |
7120
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
7121
|
|
|
|
|
|
|
break; |
7122
|
|
|
|
|
|
|
case 8: |
7123
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 3, append = "y"; } |
7124
|
|
|
|
|
|
|
break; |
7125
|
|
|
|
|
|
|
case 9: |
7126
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
7127
|
|
|
|
|
|
|
break; |
7128
|
|
|
|
|
|
|
case 10: |
7129
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
7130
|
|
|
|
|
|
|
break; |
7131
|
|
|
|
|
|
|
case 11: |
7132
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
7133
|
|
|
|
|
|
|
break; |
7134
|
|
|
|
|
|
|
case 12: |
7135
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
7136
|
|
|
|
|
|
|
break; |
7137
|
|
|
|
|
|
|
case 13: |
7138
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
7139
|
|
|
|
|
|
|
break; |
7140
|
|
|
|
|
|
|
case 14: |
7141
|
0
|
0
|
|
|
|
|
{ if (best > 'o') best = 'o', remove = 2, append = nullptr; } |
7142
|
|
|
|
|
|
|
break; |
7143
|
|
|
|
|
|
|
case 15: |
7144
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 1, append = nullptr; } |
7145
|
|
|
|
|
|
|
break; |
7146
|
|
|
|
|
|
|
case 16: |
7147
|
0
|
0
|
|
|
|
|
{ if (best > 'q') best = 'q', remove = 2, append = nullptr; } |
7148
|
|
|
|
|
|
|
break; |
7149
|
|
|
|
|
|
|
case 17: |
7150
|
0
|
0
|
|
|
|
|
{ if (best > 'r') best = 'r', remove = 1, append = nullptr; } |
7151
|
|
|
|
|
|
|
break; |
7152
|
|
|
|
|
|
|
} |
7153
|
|
|
|
|
|
|
} |
7154
|
|
|
|
|
|
|
|
7155
|
|
|
|
|
|
|
_again: |
7156
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
7157
|
|
|
|
|
|
|
goto _out; |
7158
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
7159
|
|
|
|
|
|
|
goto _resume; |
7160
|
|
|
|
|
|
|
_test_eof: {} |
7161
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
7162
|
|
|
|
|
|
|
{ |
7163
|
0
|
|
|
|
|
|
const char *__acts = _VBD_VBN_actions + _VBD_VBN_eof_actions[cs]; |
7164
|
0
|
|
|
|
|
|
unsigned int __nacts = (unsigned int) *__acts++; |
7165
|
0
|
0
|
|
|
|
|
while ( __nacts-- > 0 ) { |
7166
|
0
|
|
|
|
|
|
switch ( *__acts++ ) { |
7167
|
|
|
|
|
|
|
case 3: |
7168
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7169
|
|
|
|
|
|
|
break; |
7170
|
|
|
|
|
|
|
case 6: |
7171
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
7172
|
|
|
|
|
|
|
break; |
7173
|
|
|
|
|
|
|
case 9: |
7174
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
7175
|
|
|
|
|
|
|
break; |
7176
|
|
|
|
|
|
|
} |
7177
|
|
|
|
|
|
|
} |
7178
|
|
|
|
|
|
|
} |
7179
|
|
|
|
|
|
|
|
7180
|
|
|
|
|
|
|
_out: {} |
7181
|
|
|
|
|
|
|
} |
7182
|
|
|
|
|
|
|
|
7183
|
0
|
0
|
|
|
|
|
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0
|
|
|
|
|
|
7184
|
0
|
|
|
|
|
|
} |
7185
|
|
|
|
|
|
|
|
7186
|
|
|
|
|
|
|
static const char _VBZ_actions[] = { |
7187
|
|
|
|
|
|
|
0, 1, 0, 1, 1, 1, 2, 1, |
7188
|
|
|
|
|
|
|
3, 1, 4, 1, 5, 1, 6, 1, |
7189
|
|
|
|
|
|
|
7, 1, 8 |
7190
|
|
|
|
|
|
|
}; |
7191
|
|
|
|
|
|
|
|
7192
|
|
|
|
|
|
|
static const char _VBZ_key_offsets[] = { |
7193
|
|
|
|
|
|
|
0, 0, 1, 2, 4, 14, 14, 25, |
7194
|
|
|
|
|
|
|
26, 31, 31, 31, 31, 37, 45, 54 |
7195
|
|
|
|
|
|
|
}; |
7196
|
|
|
|
|
|
|
|
7197
|
|
|
|
|
|
|
static const char _VBZ_trans_keys[] = { |
7198
|
|
|
|
|
|
|
115, 101, 99, 115, 98, 100, 102, 104, |
7199
|
|
|
|
|
|
|
106, 110, 112, 116, 118, 122, 122, 98, |
7200
|
|
|
|
|
|
|
100, 102, 104, 106, 110, 112, 116, 118, |
7201
|
|
|
|
|
|
|
120, 111, 97, 101, 105, 111, 117, 104, |
7202
|
|
|
|
|
|
|
105, 111, 115, 120, 122, 97, 101, 105, |
7203
|
|
|
|
|
|
|
110, 111, 114, 115, 117, 97, 101, 105, |
7204
|
|
|
|
|
|
|
111, 117, 121, 122, 98, 120, 0 |
7205
|
|
|
|
|
|
|
}; |
7206
|
|
|
|
|
|
|
|
7207
|
|
|
|
|
|
|
static const char _VBZ_single_lengths[] = { |
7208
|
|
|
|
|
|
|
0, 1, 1, 2, 0, 0, 1, 1, |
7209
|
|
|
|
|
|
|
5, 0, 0, 0, 6, 8, 7, 0 |
7210
|
|
|
|
|
|
|
}; |
7211
|
|
|
|
|
|
|
|
7212
|
|
|
|
|
|
|
static const char _VBZ_range_lengths[] = { |
7213
|
|
|
|
|
|
|
0, 0, 0, 0, 5, 0, 5, 0, |
7214
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 1, 0 |
7215
|
|
|
|
|
|
|
}; |
7216
|
|
|
|
|
|
|
|
7217
|
|
|
|
|
|
|
static const char _VBZ_index_offsets[] = { |
7218
|
|
|
|
|
|
|
0, 0, 2, 4, 7, 13, 14, 21, |
7219
|
|
|
|
|
|
|
23, 29, 30, 31, 32, 39, 48, 57 |
7220
|
|
|
|
|
|
|
}; |
7221
|
|
|
|
|
|
|
|
7222
|
|
|
|
|
|
|
static const char _VBZ_indicies[] = { |
7223
|
|
|
|
|
|
|
0, 1, 3, 2, 4, 4, 1, 5, |
7224
|
|
|
|
|
|
|
5, 5, 5, 5, 1, 6, 7, 7, |
7225
|
|
|
|
|
|
|
7, 7, 7, 7, 1, 8, 1, 9, |
7226
|
|
|
|
|
|
|
9, 9, 9, 9, 1, 8, 10, 1, |
7227
|
|
|
|
|
|
|
11, 12, 13, 14, 4, 15, 1, 16, |
7228
|
|
|
|
|
|
|
16, 16, 17, 16, 18, 19, 16, 1, |
7229
|
|
|
|
|
|
|
20, 20, 20, 20, 20, 20, 22, 21, |
7230
|
|
|
|
|
|
|
1, 10, 0 |
7231
|
|
|
|
|
|
|
}; |
7232
|
|
|
|
|
|
|
|
7233
|
|
|
|
|
|
|
static const char _VBZ_trans_targs[] = { |
7234
|
|
|
|
|
|
|
2, 0, 11, 12, 11, 5, 11, 11, |
7235
|
|
|
|
|
|
|
11, 9, 11, 3, 4, 6, 13, 14, |
7236
|
|
|
|
|
|
|
11, 7, 8, 11, 11, 10, 15 |
7237
|
|
|
|
|
|
|
}; |
7238
|
|
|
|
|
|
|
|
7239
|
|
|
|
|
|
|
static const char _VBZ_trans_actions[] = { |
7240
|
|
|
|
|
|
|
0, 0, 17, 17, 11, 0, 13, 15, |
7241
|
|
|
|
|
|
|
9, 0, 3, 0, 0, 0, 11, 11, |
7242
|
|
|
|
|
|
|
1, 0, 0, 7, 5, 0, 7 |
7243
|
|
|
|
|
|
|
}; |
7244
|
|
|
|
|
|
|
|
7245
|
|
|
|
|
|
|
static const int VBZ_start = 1; |
7246
|
|
|
|
|
|
|
|
7247
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_VBZ(const string& form, vector& lemmas) const { |
7248
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
7249
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
7250
|
|
|
|
|
|
|
|
7251
|
|
|
|
|
|
|
{ |
7252
|
|
|
|
|
|
|
cs = VBZ_start; |
7253
|
|
|
|
|
|
|
} |
7254
|
|
|
|
|
|
|
|
7255
|
|
|
|
|
|
|
{ |
7256
|
|
|
|
|
|
|
int _klen; |
7257
|
|
|
|
|
|
|
unsigned int _trans; |
7258
|
|
|
|
|
|
|
const char *_acts; |
7259
|
|
|
|
|
|
|
unsigned int _nacts; |
7260
|
|
|
|
|
|
|
const char *_keys; |
7261
|
|
|
|
|
|
|
|
7262
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
7263
|
|
|
|
|
|
|
goto _test_eof; |
7264
|
|
|
|
|
|
|
if ( cs == 0 ) |
7265
|
|
|
|
|
|
|
goto _out; |
7266
|
|
|
|
|
|
|
_resume: |
7267
|
0
|
|
|
|
|
|
_keys = _VBZ_trans_keys + _VBZ_key_offsets[cs]; |
7268
|
0
|
|
|
|
|
|
_trans = _VBZ_index_offsets[cs]; |
7269
|
|
|
|
|
|
|
|
7270
|
0
|
|
|
|
|
|
_klen = _VBZ_single_lengths[cs]; |
7271
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7272
|
|
|
|
|
|
|
const char *_lower = _keys; |
7273
|
|
|
|
|
|
|
const char *_mid; |
7274
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
7275
|
|
|
|
|
|
|
while (1) { |
7276
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7277
|
|
|
|
|
|
|
break; |
7278
|
|
|
|
|
|
|
|
7279
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
7280
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
7281
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
7282
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
7283
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
7284
|
|
|
|
|
|
|
else { |
7285
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
7286
|
0
|
|
|
|
|
|
goto _match; |
7287
|
|
|
|
|
|
|
} |
7288
|
|
|
|
|
|
|
} |
7289
|
0
|
|
|
|
|
|
_keys += _klen; |
7290
|
0
|
|
|
|
|
|
_trans += _klen; |
7291
|
|
|
|
|
|
|
} |
7292
|
|
|
|
|
|
|
|
7293
|
0
|
|
|
|
|
|
_klen = _VBZ_range_lengths[cs]; |
7294
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7295
|
|
|
|
|
|
|
const char *_lower = _keys; |
7296
|
|
|
|
|
|
|
const char *_mid; |
7297
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
7298
|
|
|
|
|
|
|
while (1) { |
7299
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7300
|
|
|
|
|
|
|
break; |
7301
|
|
|
|
|
|
|
|
7302
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
7303
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
7304
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
7305
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
7306
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
7307
|
|
|
|
|
|
|
else { |
7308
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
7309
|
0
|
|
|
|
|
|
goto _match; |
7310
|
|
|
|
|
|
|
} |
7311
|
|
|
|
|
|
|
} |
7312
|
0
|
|
|
|
|
|
_trans += _klen; |
7313
|
|
|
|
|
|
|
} |
7314
|
|
|
|
|
|
|
|
7315
|
|
|
|
|
|
|
_match: |
7316
|
0
|
|
|
|
|
|
_trans = _VBZ_indicies[_trans]; |
7317
|
0
|
|
|
|
|
|
cs = _VBZ_trans_targs[_trans]; |
7318
|
|
|
|
|
|
|
|
7319
|
0
|
0
|
|
|
|
|
if ( _VBZ_trans_actions[_trans] == 0 ) |
7320
|
|
|
|
|
|
|
goto _again; |
7321
|
|
|
|
|
|
|
|
7322
|
0
|
|
|
|
|
|
_acts = _VBZ_actions + _VBZ_trans_actions[_trans]; |
7323
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
7324
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
7325
|
|
|
|
|
|
|
{ |
7326
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
7327
|
|
|
|
|
|
|
{ |
7328
|
|
|
|
|
|
|
case 0: |
7329
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
7330
|
|
|
|
|
|
|
break; |
7331
|
|
|
|
|
|
|
case 1: |
7332
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
7333
|
|
|
|
|
|
|
break; |
7334
|
|
|
|
|
|
|
case 2: |
7335
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
7336
|
|
|
|
|
|
|
break; |
7337
|
|
|
|
|
|
|
case 3: |
7338
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7339
|
|
|
|
|
|
|
break; |
7340
|
|
|
|
|
|
|
case 4: |
7341
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7342
|
|
|
|
|
|
|
break; |
7343
|
|
|
|
|
|
|
case 5: |
7344
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7345
|
|
|
|
|
|
|
break; |
7346
|
|
|
|
|
|
|
case 6: |
7347
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 3, append = "y"; } |
7348
|
|
|
|
|
|
|
break; |
7349
|
|
|
|
|
|
|
case 7: |
7350
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
7351
|
|
|
|
|
|
|
break; |
7352
|
|
|
|
|
|
|
case 8: |
7353
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
7354
|
|
|
|
|
|
|
break; |
7355
|
|
|
|
|
|
|
} |
7356
|
|
|
|
|
|
|
} |
7357
|
|
|
|
|
|
|
|
7358
|
|
|
|
|
|
|
_again: |
7359
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
7360
|
|
|
|
|
|
|
goto _out; |
7361
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
7362
|
|
|
|
|
|
|
goto _resume; |
7363
|
|
|
|
|
|
|
_test_eof: {} |
7364
|
|
|
|
|
|
|
_out: {} |
7365
|
|
|
|
|
|
|
} |
7366
|
|
|
|
|
|
|
|
7367
|
0
|
0
|
|
|
|
|
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0
|
|
|
|
|
|
7368
|
0
|
|
|
|
|
|
} |
7369
|
|
|
|
|
|
|
|
7370
|
|
|
|
|
|
|
static const char _JJR_RBR_actions[] = { |
7371
|
|
|
|
|
|
|
0, 1, 0, 1, 1, 1, 3, 1, |
7372
|
|
|
|
|
|
|
4, 1, 5, 2, 1, 4, 2, 2, |
7373
|
|
|
|
|
|
|
5, 2, 4, 5 |
7374
|
|
|
|
|
|
|
}; |
7375
|
|
|
|
|
|
|
|
7376
|
|
|
|
|
|
|
static const unsigned char _JJR_RBR_key_offsets[] = { |
7377
|
|
|
|
|
|
|
0, 0, 1, 2, 26, 26, 32, 37, |
7378
|
|
|
|
|
|
|
50, 56, 62, 73, 79, 85, 91, 102, |
7379
|
|
|
|
|
|
|
103, 109, 115, 117, 123, 129, 135, 146, |
7380
|
|
|
|
|
|
|
152, 163, 169, 175, 181 |
7381
|
|
|
|
|
|
|
}; |
7382
|
|
|
|
|
|
|
|
7383
|
|
|
|
|
|
|
static const char _JJR_RBR_trans_keys[] = { |
7384
|
|
|
|
|
|
|
114, 101, 98, 99, 100, 101, 102, 103, |
7385
|
|
|
|
|
|
|
104, 105, 106, 107, 108, 109, 110, 112, |
7386
|
|
|
|
|
|
|
113, 114, 115, 116, 117, 118, 119, 120, |
7387
|
|
|
|
|
|
|
121, 122, 97, 98, 101, 105, 111, 117, |
7388
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 98, 99, 100, |
7389
|
|
|
|
|
|
|
105, 111, 117, 122, 97, 101, 102, 109, |
7390
|
|
|
|
|
|
|
112, 120, 97, 100, 101, 105, 111, 117, |
7391
|
|
|
|
|
|
|
97, 101, 102, 105, 111, 117, 97, 101, |
7392
|
|
|
|
|
|
|
103, 105, 111, 117, 122, 98, 109, 112, |
7393
|
|
|
|
|
|
|
120, 97, 101, 104, 105, 111, 117, 97, |
7394
|
|
|
|
|
|
|
101, 105, 106, 111, 117, 97, 101, 105, |
7395
|
|
|
|
|
|
|
107, 111, 117, 97, 101, 105, 108, 111, |
7396
|
|
|
|
|
|
|
117, 122, 98, 109, 112, 120, 101, 97, |
7397
|
|
|
|
|
|
|
101, 105, 109, 111, 117, 97, 101, 105, |
7398
|
|
|
|
|
|
|
110, 111, 117, 97, 122, 97, 101, 105, |
7399
|
|
|
|
|
|
|
111, 112, 117, 97, 101, 105, 111, 113, |
7400
|
|
|
|
|
|
|
117, 97, 101, 105, 111, 114, 117, 97, |
7401
|
|
|
|
|
|
|
101, 105, 111, 115, 117, 122, 98, 109, |
7402
|
|
|
|
|
|
|
112, 120, 97, 101, 105, 111, 116, 117, |
7403
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 118, 122, 98, |
7404
|
|
|
|
|
|
|
109, 112, 120, 97, 101, 105, 111, 117, |
7405
|
|
|
|
|
|
|
119, 97, 101, 105, 111, 117, 120, 97, |
7406
|
|
|
|
|
|
|
101, 105, 111, 117, 121, 97, 101, 105, |
7407
|
|
|
|
|
|
|
111, 117, 122, 0 |
7408
|
|
|
|
|
|
|
}; |
7409
|
|
|
|
|
|
|
|
7410
|
|
|
|
|
|
|
static const char _JJR_RBR_single_lengths[] = { |
7411
|
|
|
|
|
|
|
0, 1, 1, 24, 0, 6, 5, 7, |
7412
|
|
|
|
|
|
|
6, 6, 7, 6, 6, 6, 7, 1, |
7413
|
|
|
|
|
|
|
6, 6, 0, 6, 6, 6, 7, 6, |
7414
|
|
|
|
|
|
|
7, 6, 6, 6, 6 |
7415
|
|
|
|
|
|
|
}; |
7416
|
|
|
|
|
|
|
|
7417
|
|
|
|
|
|
|
static const char _JJR_RBR_range_lengths[] = { |
7418
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 3, |
7419
|
|
|
|
|
|
|
0, 0, 2, 0, 0, 0, 2, 0, |
7420
|
|
|
|
|
|
|
0, 0, 1, 0, 0, 0, 2, 0, |
7421
|
|
|
|
|
|
|
2, 0, 0, 0, 0 |
7422
|
|
|
|
|
|
|
}; |
7423
|
|
|
|
|
|
|
|
7424
|
|
|
|
|
|
|
static const unsigned char _JJR_RBR_index_offsets[] = { |
7425
|
|
|
|
|
|
|
0, 0, 2, 4, 29, 30, 37, 43, |
7426
|
|
|
|
|
|
|
54, 61, 68, 78, 85, 92, 99, 109, |
7427
|
|
|
|
|
|
|
111, 118, 125, 127, 134, 141, 148, 158, |
7428
|
|
|
|
|
|
|
165, 175, 182, 189, 196 |
7429
|
|
|
|
|
|
|
}; |
7430
|
|
|
|
|
|
|
|
7431
|
|
|
|
|
|
|
static const char _JJR_RBR_indicies[] = { |
7432
|
|
|
|
|
|
|
0, 1, 2, 1, 4, 5, 6, 7, |
7433
|
|
|
|
|
|
|
8, 9, 10, 11, 12, 13, 14, 15, |
7434
|
|
|
|
|
|
|
16, 17, 18, 19, 20, 21, 7, 22, |
7435
|
|
|
|
|
|
|
23, 24, 25, 26, 3, 1, 27, 28, |
7436
|
|
|
|
|
|
|
27, 27, 27, 27, 1, 29, 29, 29, |
7437
|
|
|
|
|
|
|
29, 29, 1, 30, 31, 30, 27, 27, |
7438
|
|
|
|
|
|
|
27, 30, 27, 30, 30, 1, 27, 28, |
7439
|
|
|
|
|
|
|
27, 27, 27, 27, 1, 27, 27, 28, |
7440
|
|
|
|
|
|
|
27, 27, 27, 1, 27, 27, 31, 27, |
7441
|
|
|
|
|
|
|
27, 27, 30, 30, 30, 1, 27, 27, |
7442
|
|
|
|
|
|
|
28, 27, 27, 27, 1, 27, 27, 27, |
7443
|
|
|
|
|
|
|
28, 27, 27, 1, 27, 27, 27, 28, |
7444
|
|
|
|
|
|
|
27, 27, 1, 27, 27, 27, 32, 27, |
7445
|
|
|
|
|
|
|
27, 30, 30, 30, 1, 1, 33, 27, |
7446
|
|
|
|
|
|
|
27, 27, 28, 27, 27, 1, 34, 34, |
7447
|
|
|
|
|
|
|
34, 28, 34, 34, 1, 29, 1, 34, |
7448
|
|
|
|
|
|
|
34, 34, 34, 28, 34, 1, 27, 27, |
7449
|
|
|
|
|
|
|
27, 27, 28, 27, 1, 27, 27, 27, |
7450
|
|
|
|
|
|
|
27, 28, 27, 1, 27, 27, 27, 27, |
7451
|
|
|
|
|
|
|
31, 27, 30, 30, 30, 1, 27, 27, |
7452
|
|
|
|
|
|
|
27, 27, 28, 27, 1, 27, 27, 27, |
7453
|
|
|
|
|
|
|
27, 27, 31, 30, 30, 30, 1, 34, |
7454
|
|
|
|
|
|
|
34, 34, 34, 34, 28, 1, 34, 34, |
7455
|
|
|
|
|
|
|
34, 34, 34, 28, 1, 27, 27, 27, |
7456
|
|
|
|
|
|
|
27, 27, 28, 1, 27, 27, 27, 27, |
7457
|
|
|
|
|
|
|
27, 28, 1, 0 |
7458
|
|
|
|
|
|
|
}; |
7459
|
|
|
|
|
|
|
|
7460
|
|
|
|
|
|
|
static const char _JJR_RBR_trans_targs[] = { |
7461
|
|
|
|
|
|
|
2, 0, 3, 4, 5, 7, 8, 4, |
7462
|
|
|
|
|
|
|
9, 10, 11, 4, 12, 13, 14, 16, |
7463
|
|
|
|
|
|
|
17, 19, 20, 21, 22, 23, 24, 25, |
7464
|
|
|
|
|
|
|
26, 27, 28, 6, 4, 4, 4, 4, |
7465
|
|
|
|
|
|
|
15, 4, 18 |
7466
|
|
|
|
|
|
|
}; |
7467
|
|
|
|
|
|
|
|
7468
|
|
|
|
|
|
|
static const char _JJR_RBR_trans_actions[] = { |
7469
|
|
|
|
|
|
|
0, 0, 0, 9, 9, 9, 9, 17, |
7470
|
|
|
|
|
|
|
9, 9, 9, 14, 9, 9, 9, 9, |
7471
|
|
|
|
|
|
|
9, 9, 9, 9, 9, 9, 9, 9, |
7472
|
|
|
|
|
|
|
9, 9, 9, 7, 3, 5, 7, 11, |
7473
|
|
|
|
|
|
|
11, 1, 7 |
7474
|
|
|
|
|
|
|
}; |
7475
|
|
|
|
|
|
|
|
7476
|
|
|
|
|
|
|
static const int JJR_RBR_start = 1; |
7477
|
|
|
|
|
|
|
|
7478
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_JJR_RBR(const string& form, unsigned negation_len, vector& lemmas) const { |
7479
|
0
|
|
|
|
|
|
const char* p = form.c_str() + negation_len; int cs; |
7480
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
7481
|
|
|
|
|
|
|
|
7482
|
|
|
|
|
|
|
{ |
7483
|
|
|
|
|
|
|
cs = JJR_RBR_start; |
7484
|
|
|
|
|
|
|
} |
7485
|
|
|
|
|
|
|
|
7486
|
|
|
|
|
|
|
{ |
7487
|
|
|
|
|
|
|
int _klen; |
7488
|
|
|
|
|
|
|
unsigned int _trans; |
7489
|
|
|
|
|
|
|
const char *_acts; |
7490
|
|
|
|
|
|
|
unsigned int _nacts; |
7491
|
|
|
|
|
|
|
const char *_keys; |
7492
|
|
|
|
|
|
|
|
7493
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
7494
|
|
|
|
|
|
|
goto _test_eof; |
7495
|
|
|
|
|
|
|
if ( cs == 0 ) |
7496
|
|
|
|
|
|
|
goto _out; |
7497
|
|
|
|
|
|
|
_resume: |
7498
|
0
|
|
|
|
|
|
_keys = _JJR_RBR_trans_keys + _JJR_RBR_key_offsets[cs]; |
7499
|
0
|
|
|
|
|
|
_trans = _JJR_RBR_index_offsets[cs]; |
7500
|
|
|
|
|
|
|
|
7501
|
0
|
|
|
|
|
|
_klen = _JJR_RBR_single_lengths[cs]; |
7502
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7503
|
|
|
|
|
|
|
const char *_lower = _keys; |
7504
|
|
|
|
|
|
|
const char *_mid; |
7505
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
7506
|
|
|
|
|
|
|
while (1) { |
7507
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7508
|
|
|
|
|
|
|
break; |
7509
|
|
|
|
|
|
|
|
7510
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
7511
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
7512
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
7513
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
7514
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
7515
|
|
|
|
|
|
|
else { |
7516
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
7517
|
0
|
|
|
|
|
|
goto _match; |
7518
|
|
|
|
|
|
|
} |
7519
|
|
|
|
|
|
|
} |
7520
|
0
|
|
|
|
|
|
_keys += _klen; |
7521
|
0
|
|
|
|
|
|
_trans += _klen; |
7522
|
|
|
|
|
|
|
} |
7523
|
|
|
|
|
|
|
|
7524
|
0
|
|
|
|
|
|
_klen = _JJR_RBR_range_lengths[cs]; |
7525
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7526
|
|
|
|
|
|
|
const char *_lower = _keys; |
7527
|
|
|
|
|
|
|
const char *_mid; |
7528
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
7529
|
|
|
|
|
|
|
while (1) { |
7530
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7531
|
|
|
|
|
|
|
break; |
7532
|
|
|
|
|
|
|
|
7533
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
7534
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
7535
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
7536
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
7537
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
7538
|
|
|
|
|
|
|
else { |
7539
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
7540
|
0
|
|
|
|
|
|
goto _match; |
7541
|
|
|
|
|
|
|
} |
7542
|
|
|
|
|
|
|
} |
7543
|
0
|
|
|
|
|
|
_trans += _klen; |
7544
|
|
|
|
|
|
|
} |
7545
|
|
|
|
|
|
|
|
7546
|
|
|
|
|
|
|
_match: |
7547
|
0
|
|
|
|
|
|
_trans = _JJR_RBR_indicies[_trans]; |
7548
|
0
|
|
|
|
|
|
cs = _JJR_RBR_trans_targs[_trans]; |
7549
|
|
|
|
|
|
|
|
7550
|
0
|
0
|
|
|
|
|
if ( _JJR_RBR_trans_actions[_trans] == 0 ) |
7551
|
|
|
|
|
|
|
goto _again; |
7552
|
|
|
|
|
|
|
|
7553
|
0
|
|
|
|
|
|
_acts = _JJR_RBR_actions + _JJR_RBR_trans_actions[_trans]; |
7554
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
7555
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
7556
|
|
|
|
|
|
|
{ |
7557
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
7558
|
|
|
|
|
|
|
{ |
7559
|
|
|
|
|
|
|
case 0: |
7560
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 2, append = nullptr; } |
7561
|
|
|
|
|
|
|
break; |
7562
|
|
|
|
|
|
|
case 1: |
7563
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 3, append = nullptr; } |
7564
|
|
|
|
|
|
|
break; |
7565
|
|
|
|
|
|
|
case 2: |
7566
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = "y"; } |
7567
|
|
|
|
|
|
|
break; |
7568
|
|
|
|
|
|
|
case 3: |
7569
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7570
|
|
|
|
|
|
|
break; |
7571
|
|
|
|
|
|
|
case 4: |
7572
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7573
|
|
|
|
|
|
|
break; |
7574
|
|
|
|
|
|
|
case 5: |
7575
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7576
|
|
|
|
|
|
|
break; |
7577
|
|
|
|
|
|
|
} |
7578
|
|
|
|
|
|
|
} |
7579
|
|
|
|
|
|
|
|
7580
|
|
|
|
|
|
|
_again: |
7581
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
7582
|
|
|
|
|
|
|
goto _out; |
7583
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
7584
|
|
|
|
|
|
|
goto _resume; |
7585
|
|
|
|
|
|
|
_test_eof: {} |
7586
|
|
|
|
|
|
|
_out: {} |
7587
|
|
|
|
|
|
|
} |
7588
|
|
|
|
|
|
|
|
7589
|
0
|
0
|
|
|
|
|
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
7590
|
0
|
|
|
|
|
|
} |
7591
|
|
|
|
|
|
|
|
7592
|
|
|
|
|
|
|
static const char _JJS_RBS_actions[] = { |
7593
|
|
|
|
|
|
|
0, 1, 1, 1, 2, 1, 4, 1, |
7594
|
|
|
|
|
|
|
5, 2, 0, 5, 2, 1, 4, 2, |
7595
|
|
|
|
|
|
|
3, 5 |
7596
|
|
|
|
|
|
|
}; |
7597
|
|
|
|
|
|
|
|
7598
|
|
|
|
|
|
|
static const unsigned char _JJS_RBS_key_offsets[] = { |
7599
|
|
|
|
|
|
|
0, 0, 1, 2, 3, 25, 25, 25, |
7600
|
|
|
|
|
|
|
31, 44, 50, 56, 67, 73, 79, 85, |
7601
|
|
|
|
|
|
|
96, 102, 108, 114, 120, 126, 137, 143, |
7602
|
|
|
|
|
|
|
154, 160, 166, 172, 178, 178, 183, 183, |
7603
|
|
|
|
|
|
|
183, 184 |
7604
|
|
|
|
|
|
|
}; |
7605
|
|
|
|
|
|
|
|
7606
|
|
|
|
|
|
|
static const char _JJS_RBS_trans_keys[] = { |
7607
|
|
|
|
|
|
|
116, 115, 101, 98, 99, 100, 102, 103, |
7608
|
|
|
|
|
|
|
104, 105, 106, 107, 108, 109, 110, 112, |
7609
|
|
|
|
|
|
|
113, 114, 115, 116, 118, 119, 120, 121, |
7610
|
|
|
|
|
|
|
122, 97, 98, 101, 105, 111, 117, 98, |
7611
|
|
|
|
|
|
|
99, 100, 105, 111, 117, 122, 97, 101, |
7612
|
|
|
|
|
|
|
102, 109, 112, 120, 97, 100, 101, 105, |
7613
|
|
|
|
|
|
|
111, 117, 97, 101, 102, 105, 111, 117, |
7614
|
|
|
|
|
|
|
97, 101, 103, 105, 111, 117, 122, 98, |
7615
|
|
|
|
|
|
|
109, 112, 120, 97, 101, 104, 105, 111, |
7616
|
|
|
|
|
|
|
117, 97, 101, 105, 106, 111, 117, 97, |
7617
|
|
|
|
|
|
|
101, 105, 107, 111, 117, 97, 101, 105, |
7618
|
|
|
|
|
|
|
108, 111, 117, 122, 98, 109, 112, 120, |
7619
|
|
|
|
|
|
|
97, 101, 105, 109, 111, 117, 97, 101, |
7620
|
|
|
|
|
|
|
105, 110, 111, 117, 97, 101, 105, 111, |
7621
|
|
|
|
|
|
|
112, 117, 97, 101, 105, 111, 113, 117, |
7622
|
|
|
|
|
|
|
97, 101, 105, 111, 114, 117, 97, 101, |
7623
|
|
|
|
|
|
|
105, 111, 115, 117, 122, 98, 109, 112, |
7624
|
|
|
|
|
|
|
120, 97, 101, 105, 111, 116, 117, 97, |
7625
|
|
|
|
|
|
|
101, 105, 111, 117, 118, 122, 98, 109, |
7626
|
|
|
|
|
|
|
112, 120, 97, 101, 105, 111, 117, 119, |
7627
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 120, 97, 101, |
7628
|
|
|
|
|
|
|
105, 111, 117, 121, 97, 101, 105, 111, |
7629
|
|
|
|
|
|
|
117, 122, 97, 101, 105, 111, 117, 101, |
7630
|
|
|
|
|
|
|
97, 122, 0 |
7631
|
|
|
|
|
|
|
}; |
7632
|
|
|
|
|
|
|
|
7633
|
|
|
|
|
|
|
static const char _JJS_RBS_single_lengths[] = { |
7634
|
|
|
|
|
|
|
0, 1, 1, 1, 22, 0, 0, 6, |
7635
|
|
|
|
|
|
|
7, 6, 6, 7, 6, 6, 6, 7, |
7636
|
|
|
|
|
|
|
6, 6, 6, 6, 6, 7, 6, 7, |
7637
|
|
|
|
|
|
|
6, 6, 6, 6, 0, 5, 0, 0, |
7638
|
|
|
|
|
|
|
1, 0 |
7639
|
|
|
|
|
|
|
}; |
7640
|
|
|
|
|
|
|
|
7641
|
|
|
|
|
|
|
static const char _JJS_RBS_range_lengths[] = { |
7642
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7643
|
|
|
|
|
|
|
3, 0, 0, 2, 0, 0, 0, 2, |
7644
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 2, 0, 2, |
7645
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7646
|
|
|
|
|
|
|
0, 1 |
7647
|
|
|
|
|
|
|
}; |
7648
|
|
|
|
|
|
|
|
7649
|
|
|
|
|
|
|
static const unsigned char _JJS_RBS_index_offsets[] = { |
7650
|
|
|
|
|
|
|
0, 0, 2, 4, 6, 29, 30, 31, |
7651
|
|
|
|
|
|
|
38, 49, 56, 63, 73, 80, 87, 94, |
7652
|
|
|
|
|
|
|
104, 111, 118, 125, 132, 139, 149, 156, |
7653
|
|
|
|
|
|
|
166, 173, 180, 187, 194, 195, 201, 202, |
7654
|
|
|
|
|
|
|
203, 205 |
7655
|
|
|
|
|
|
|
}; |
7656
|
|
|
|
|
|
|
|
7657
|
|
|
|
|
|
|
static const char _JJS_RBS_indicies[] = { |
7658
|
|
|
|
|
|
|
0, 1, 2, 1, 3, 1, 5, 6, |
7659
|
|
|
|
|
|
|
7, 8, 9, 10, 11, 12, 13, 14, |
7660
|
|
|
|
|
|
|
15, 16, 17, 18, 19, 20, 21, 22, |
7661
|
|
|
|
|
|
|
23, 24, 25, 26, 4, 27, 28, 29, |
7662
|
|
|
|
|
|
|
30, 29, 29, 29, 29, 27, 31, 32, |
7663
|
|
|
|
|
|
|
31, 29, 29, 29, 31, 29, 31, 31, |
7664
|
|
|
|
|
|
|
27, 29, 30, 29, 29, 29, 29, 27, |
7665
|
|
|
|
|
|
|
29, 29, 30, 29, 29, 29, 27, 29, |
7666
|
|
|
|
|
|
|
29, 32, 29, 29, 29, 31, 31, 31, |
7667
|
|
|
|
|
|
|
27, 29, 29, 30, 29, 29, 29, 27, |
7668
|
|
|
|
|
|
|
29, 29, 29, 30, 29, 29, 27, 29, |
7669
|
|
|
|
|
|
|
29, 29, 30, 29, 29, 27, 29, 29, |
7670
|
|
|
|
|
|
|
29, 33, 29, 29, 31, 31, 31, 27, |
7671
|
|
|
|
|
|
|
29, 29, 29, 30, 29, 29, 27, 34, |
7672
|
|
|
|
|
|
|
34, 34, 30, 34, 34, 27, 34, 34, |
7673
|
|
|
|
|
|
|
34, 34, 30, 34, 27, 29, 29, 29, |
7674
|
|
|
|
|
|
|
29, 30, 29, 27, 29, 29, 29, 29, |
7675
|
|
|
|
|
|
|
30, 29, 27, 29, 29, 29, 29, 32, |
7676
|
|
|
|
|
|
|
29, 31, 31, 31, 27, 29, 29, 29, |
7677
|
|
|
|
|
|
|
29, 30, 29, 27, 29, 29, 29, 29, |
7678
|
|
|
|
|
|
|
29, 32, 31, 31, 31, 27, 34, 34, |
7679
|
|
|
|
|
|
|
34, 34, 34, 30, 27, 34, 34, 34, |
7680
|
|
|
|
|
|
|
34, 34, 30, 27, 29, 29, 29, 29, |
7681
|
|
|
|
|
|
|
29, 30, 27, 29, 29, 29, 29, 29, |
7682
|
|
|
|
|
|
|
30, 27, 1, 35, 35, 35, 35, 35, |
7683
|
|
|
|
|
|
|
28, 28, 27, 28, 36, 35, 28, 0 |
7684
|
|
|
|
|
|
|
}; |
7685
|
|
|
|
|
|
|
|
7686
|
|
|
|
|
|
|
static const char _JJS_RBS_trans_targs[] = { |
7687
|
|
|
|
|
|
|
2, 0, 3, 4, 5, 7, 8, 9, |
7688
|
|
|
|
|
|
|
10, 11, 12, 31, 13, 14, 15, 16, |
7689
|
|
|
|
|
|
|
17, 18, 19, 20, 21, 22, 23, 24, |
7690
|
|
|
|
|
|
|
25, 26, 27, 6, 28, 29, 30, 30, |
7691
|
|
|
|
|
|
|
30, 32, 33, 28, 28 |
7692
|
|
|
|
|
|
|
}; |
7693
|
|
|
|
|
|
|
|
7694
|
|
|
|
|
|
|
static const char _JJS_RBS_trans_actions[] = { |
7695
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7696
|
|
|
|
|
|
|
0, 0, 0, 3, 0, 0, 0, 0, |
7697
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
7698
|
|
|
|
|
|
|
0, 0, 0, 0, 7, 5, 1, 5, |
7699
|
|
|
|
|
|
|
12, 12, 5, 15, 9 |
7700
|
|
|
|
|
|
|
}; |
7701
|
|
|
|
|
|
|
|
7702
|
|
|
|
|
|
|
static const int JJS_RBS_start = 1; |
7703
|
|
|
|
|
|
|
|
7704
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_JJS_RBS(const string& form, unsigned negation_len, vector& lemmas) const { |
7705
|
0
|
|
|
|
|
|
const char* p = form.c_str() + negation_len; int cs; |
7706
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
7707
|
|
|
|
|
|
|
|
7708
|
|
|
|
|
|
|
{ |
7709
|
|
|
|
|
|
|
cs = JJS_RBS_start; |
7710
|
|
|
|
|
|
|
} |
7711
|
|
|
|
|
|
|
|
7712
|
|
|
|
|
|
|
{ |
7713
|
|
|
|
|
|
|
int _klen; |
7714
|
|
|
|
|
|
|
unsigned int _trans; |
7715
|
|
|
|
|
|
|
const char *_acts; |
7716
|
|
|
|
|
|
|
unsigned int _nacts; |
7717
|
|
|
|
|
|
|
const char *_keys; |
7718
|
|
|
|
|
|
|
|
7719
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
7720
|
|
|
|
|
|
|
goto _test_eof; |
7721
|
|
|
|
|
|
|
if ( cs == 0 ) |
7722
|
|
|
|
|
|
|
goto _out; |
7723
|
|
|
|
|
|
|
_resume: |
7724
|
0
|
|
|
|
|
|
_keys = _JJS_RBS_trans_keys + _JJS_RBS_key_offsets[cs]; |
7725
|
0
|
|
|
|
|
|
_trans = _JJS_RBS_index_offsets[cs]; |
7726
|
|
|
|
|
|
|
|
7727
|
0
|
|
|
|
|
|
_klen = _JJS_RBS_single_lengths[cs]; |
7728
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7729
|
|
|
|
|
|
|
const char *_lower = _keys; |
7730
|
|
|
|
|
|
|
const char *_mid; |
7731
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
7732
|
|
|
|
|
|
|
while (1) { |
7733
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7734
|
|
|
|
|
|
|
break; |
7735
|
|
|
|
|
|
|
|
7736
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
7737
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
7738
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
7739
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
7740
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
7741
|
|
|
|
|
|
|
else { |
7742
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
7743
|
0
|
|
|
|
|
|
goto _match; |
7744
|
|
|
|
|
|
|
} |
7745
|
|
|
|
|
|
|
} |
7746
|
0
|
|
|
|
|
|
_keys += _klen; |
7747
|
0
|
|
|
|
|
|
_trans += _klen; |
7748
|
|
|
|
|
|
|
} |
7749
|
|
|
|
|
|
|
|
7750
|
0
|
|
|
|
|
|
_klen = _JJS_RBS_range_lengths[cs]; |
7751
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
7752
|
|
|
|
|
|
|
const char *_lower = _keys; |
7753
|
|
|
|
|
|
|
const char *_mid; |
7754
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
7755
|
|
|
|
|
|
|
while (1) { |
7756
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
7757
|
|
|
|
|
|
|
break; |
7758
|
|
|
|
|
|
|
|
7759
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
7760
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
7761
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
7762
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
7763
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
7764
|
|
|
|
|
|
|
else { |
7765
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
7766
|
0
|
|
|
|
|
|
goto _match; |
7767
|
|
|
|
|
|
|
} |
7768
|
|
|
|
|
|
|
} |
7769
|
0
|
|
|
|
|
|
_trans += _klen; |
7770
|
|
|
|
|
|
|
} |
7771
|
|
|
|
|
|
|
|
7772
|
|
|
|
|
|
|
_match: |
7773
|
0
|
|
|
|
|
|
_trans = _JJS_RBS_indicies[_trans]; |
7774
|
0
|
|
|
|
|
|
cs = _JJS_RBS_trans_targs[_trans]; |
7775
|
|
|
|
|
|
|
|
7776
|
0
|
0
|
|
|
|
|
if ( _JJS_RBS_trans_actions[_trans] == 0 ) |
7777
|
|
|
|
|
|
|
goto _again; |
7778
|
|
|
|
|
|
|
|
7779
|
0
|
|
|
|
|
|
_acts = _JJS_RBS_actions + _JJS_RBS_trans_actions[_trans]; |
7780
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
7781
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
7782
|
|
|
|
|
|
|
{ |
7783
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
7784
|
|
|
|
|
|
|
{ |
7785
|
|
|
|
|
|
|
case 0: |
7786
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
7787
|
|
|
|
|
|
|
break; |
7788
|
|
|
|
|
|
|
case 1: |
7789
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 4, append = nullptr; } |
7790
|
|
|
|
|
|
|
break; |
7791
|
|
|
|
|
|
|
case 2: |
7792
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 4, append = "y"; } |
7793
|
|
|
|
|
|
|
break; |
7794
|
|
|
|
|
|
|
case 3: |
7795
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 3, append = nullptr; } |
7796
|
|
|
|
|
|
|
break; |
7797
|
|
|
|
|
|
|
case 4: |
7798
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 2, append = nullptr; } |
7799
|
|
|
|
|
|
|
break; |
7800
|
|
|
|
|
|
|
case 5: |
7801
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 3, append = nullptr; } |
7802
|
|
|
|
|
|
|
break; |
7803
|
|
|
|
|
|
|
} |
7804
|
|
|
|
|
|
|
} |
7805
|
|
|
|
|
|
|
|
7806
|
|
|
|
|
|
|
_again: |
7807
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
7808
|
|
|
|
|
|
|
goto _out; |
7809
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
7810
|
|
|
|
|
|
|
goto _resume; |
7811
|
|
|
|
|
|
|
_test_eof: {} |
7812
|
|
|
|
|
|
|
_out: {} |
7813
|
|
|
|
|
|
|
} |
7814
|
|
|
|
|
|
|
|
7815
|
0
|
0
|
|
|
|
|
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
7816
|
0
|
|
|
|
|
|
} |
7817
|
|
|
|
|
|
|
|
7818
|
|
|
|
|
|
|
} // namespace morphodita |
7819
|
|
|
|
|
|
|
|
7820
|
|
|
|
|
|
|
///////// |
7821
|
|
|
|
|
|
|
// File: morphodita/morpho/external_morpho.h |
7822
|
|
|
|
|
|
|
///////// |
7823
|
|
|
|
|
|
|
|
7824
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
7825
|
|
|
|
|
|
|
// |
7826
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
7827
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
7828
|
|
|
|
|
|
|
// |
7829
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
7830
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
7831
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
7832
|
|
|
|
|
|
|
|
7833
|
|
|
|
|
|
|
namespace morphodita { |
7834
|
|
|
|
|
|
|
|
7835
|
0
|
|
|
|
|
|
class external_morpho : public morpho { |
7836
|
|
|
|
|
|
|
public: |
7837
|
0
|
|
|
|
|
|
external_morpho(unsigned version) : version(version) {} |
7838
|
|
|
|
|
|
|
|
7839
|
|
|
|
|
|
|
virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override; |
7840
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override; |
7841
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const override; |
7842
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const override; |
7843
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const override; |
7844
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const override; |
7845
|
|
|
|
|
|
|
|
7846
|
|
|
|
|
|
|
bool load(istream& is); |
7847
|
|
|
|
|
|
|
|
7848
|
|
|
|
|
|
|
private: |
7849
|
|
|
|
|
|
|
unsigned version; |
7850
|
|
|
|
|
|
|
|
7851
|
|
|
|
|
|
|
string unknown_tag; |
7852
|
|
|
|
|
|
|
}; |
7853
|
|
|
|
|
|
|
|
7854
|
|
|
|
|
|
|
} // namespace morphodita |
7855
|
|
|
|
|
|
|
|
7856
|
|
|
|
|
|
|
///////// |
7857
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer.h |
7858
|
|
|
|
|
|
|
///////// |
7859
|
|
|
|
|
|
|
|
7860
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
7861
|
|
|
|
|
|
|
// |
7862
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
7863
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
7864
|
|
|
|
|
|
|
// |
7865
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
7866
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
7867
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
7868
|
|
|
|
|
|
|
|
7869
|
|
|
|
|
|
|
namespace morphodita { |
7870
|
|
|
|
|
|
|
|
7871
|
0
|
|
|
|
|
|
class generic_tokenizer : public ragel_tokenizer { |
7872
|
|
|
|
|
|
|
public: |
7873
|
|
|
|
|
|
|
enum { LATEST = 2 }; |
7874
|
|
|
|
|
|
|
generic_tokenizer(unsigned version); |
7875
|
|
|
|
|
|
|
|
7876
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
7877
|
|
|
|
|
|
|
}; |
7878
|
|
|
|
|
|
|
|
7879
|
|
|
|
|
|
|
} // namespace morphodita |
7880
|
|
|
|
|
|
|
|
7881
|
|
|
|
|
|
|
///////// |
7882
|
|
|
|
|
|
|
// File: morphodita/morpho/external_morpho.cpp |
7883
|
|
|
|
|
|
|
///////// |
7884
|
|
|
|
|
|
|
|
7885
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
7886
|
|
|
|
|
|
|
// |
7887
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
7888
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
7889
|
|
|
|
|
|
|
// |
7890
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
7891
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
7892
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
7893
|
|
|
|
|
|
|
|
7894
|
|
|
|
|
|
|
namespace morphodita { |
7895
|
|
|
|
|
|
|
|
7896
|
0
|
|
|
|
|
|
// Loads the model from `is`.
//
// The stream is decompressed into a binary_decoder; the payload is a single
// length-prefixed string (1-byte length followed by that many bytes) holding
// the tag used for unknown words. Returns false if decompression fails, if the
// decoder reports an error while reading, or if any data is left over after
// the tag has been read.
bool external_morpho::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    // Load unknown_tag
    unsigned length = data.next_1B();
    // The element type must be given explicitly: it cannot be deduced from
    // the argument, and string::assign needs a const char*.
    unknown_tag.assign(data.next<char>(length), length);
  } catch (binary_decoder_error&) {
    return false;
  }

  return data.is_end();
}
7910
|
|
|
|
|
|
|
|
7911
|
0
|
|
|
|
|
|
int external_morpho::analyze(string_piece form, guesser_mode /*guesser*/, vector& lemmas) const { |
7912
|
|
|
|
|
|
|
lemmas.clear(); |
7913
|
|
|
|
|
|
|
|
7914
|
0
|
0
|
|
|
|
|
if (form.len) { |
7915
|
|
|
|
|
|
|
// Start by skipping the first form |
7916
|
|
|
|
|
|
|
string_piece lemmatags = form; |
7917
|
0
|
0
|
|
|
|
|
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
0
|
|
|
|
|
|
7918
|
0
|
0
|
|
|
|
|
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
7919
|
|
|
|
|
|
|
|
7920
|
|
|
|
|
|
|
// Split lemmatags using ' ' into lemma-tag pairs. |
7921
|
0
|
0
|
|
|
|
|
while (lemmatags.len) { |
7922
|
|
|
|
|
|
|
auto lemma_start = lemmatags.str; |
7923
|
0
|
0
|
|
|
|
|
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
0
|
|
|
|
|
|
7924
|
0
|
0
|
|
|
|
|
if (!lemmatags.len) break; |
7925
|
|
|
|
|
|
|
auto lemma_len = lemmatags.str - lemma_start; |
7926
|
0
|
|
|
|
|
|
lemmatags.len--, lemmatags.str++; |
7927
|
|
|
|
|
|
|
|
7928
|
|
|
|
|
|
|
auto tag_start = lemmatags.str; |
7929
|
0
|
0
|
|
|
|
|
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
0
|
|
|
|
|
|
7930
|
|
|
|
|
|
|
auto tag_len = lemmatags.str - tag_start; |
7931
|
0
|
0
|
|
|
|
|
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
7932
|
|
|
|
|
|
|
|
7933
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len)); |
7934
|
|
|
|
|
|
|
} |
7935
|
|
|
|
|
|
|
|
7936
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
7937
|
|
|
|
|
|
|
} |
7938
|
|
|
|
|
|
|
|
7939
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
7940
|
0
|
|
|
|
|
|
return -1; |
7941
|
|
|
|
|
|
|
} |
7942
|
|
|
|
|
|
|
|
7943
|
0
|
|
|
|
|
|
int external_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector& forms) const { |
7944
|
|
|
|
|
|
|
forms.clear(); |
7945
|
|
|
|
|
|
|
|
7946
|
0
|
|
|
|
|
|
tag_filter filter(tag_wildcard); |
7947
|
|
|
|
|
|
|
|
7948
|
0
|
0
|
|
|
|
|
if (lemma.len) { |
7949
|
|
|
|
|
|
|
// Start by locating the lemma |
7950
|
|
|
|
|
|
|
string_piece formtags = lemma; |
7951
|
0
|
0
|
|
|
|
|
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
0
|
|
|
|
|
|
7952
|
0
|
|
|
|
|
|
string_piece real_lemma(lemma.str, lemma.len - formtags.len); |
7953
|
0
|
0
|
|
|
|
|
if (formtags.len) formtags.len--, formtags.str++; |
7954
|
|
|
|
|
|
|
|
7955
|
|
|
|
|
|
|
// Split formtags using ' ' into form-tag pairs. |
7956
|
|
|
|
|
|
|
bool any_result = false; |
7957
|
0
|
0
|
|
|
|
|
while (formtags.len) { |
7958
|
|
|
|
|
|
|
auto form_start = formtags.str; |
7959
|
0
|
0
|
|
|
|
|
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
0
|
|
|
|
|
|
7960
|
0
|
0
|
|
|
|
|
if (!formtags.len) break; |
7961
|
|
|
|
|
|
|
auto form_len = formtags.str - form_start; |
7962
|
0
|
|
|
|
|
|
formtags.len--, formtags.str++; |
7963
|
|
|
|
|
|
|
|
7964
|
|
|
|
|
|
|
auto tag_start = formtags.str; |
7965
|
0
|
0
|
|
|
|
|
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
0
|
|
|
|
|
|
7966
|
|
|
|
|
|
|
auto tag_len = formtags.str - tag_start; |
7967
|
0
|
0
|
|
|
|
|
if (formtags.len) formtags.len--, formtags.str++; |
7968
|
|
|
|
|
|
|
|
7969
|
|
|
|
|
|
|
any_result = true; |
7970
|
|
|
|
|
|
|
string tag(tag_start, tag_len); |
7971
|
0
|
0
|
|
|
|
|
if (filter.matches(tag.c_str())) { |
7972
|
0
|
0
|
|
|
|
|
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
|
|
0
|
|
|
|
|
|
7973
|
0
|
0
|
|
|
|
|
forms.back().forms.emplace_back(string(form_start, form_len), tag); |
7974
|
|
|
|
|
|
|
} |
7975
|
|
|
|
|
|
|
} |
7976
|
|
|
|
|
|
|
|
7977
|
0
|
0
|
|
|
|
|
if (any_result) return NO_GUESSER; |
7978
|
|
|
|
|
|
|
} |
7979
|
|
|
|
|
|
|
|
7980
|
|
|
|
|
|
|
return -1; |
7981
|
|
|
|
|
|
|
} |
7982
|
|
|
|
|
|
|
|
7983
|
0
|
|
|
|
|
|
// Returns the number of characters of `lemma` that precede the first space,
// or the full length when it contains no space.
int external_morpho::raw_lemma_len(string_piece lemma) const {
  unsigned prefix = 0;
  for (; prefix < lemma.len; prefix++)
    if (lemma.str[prefix] == ' ') break;
  return prefix;
}
7988
|
|
|
|
|
|
|
|
7989
|
0
|
|
|
|
|
|
// Returns the number of characters of `lemma` that precede the first space,
// or the full length when it contains no space.
int external_morpho::lemma_id_len(string_piece lemma) const {
  unsigned prefix = 0;
  for (; prefix < lemma.len; prefix++)
    if (lemma.str[prefix] == ' ') break;
  return prefix;
}
7994
|
|
|
|
|
|
|
|
7995
|
0
|
|
|
|
|
|
// Returns the number of characters of `form` that precede the first space,
// or the full length when it contains no space.
int external_morpho::raw_form_len(string_piece form) const {
  unsigned prefix = 0;
  for (; prefix < form.len; prefix++)
    if (form.str[prefix] == ' ') break;
  return prefix;
}
8000
|
|
|
|
|
|
|
|
8001
|
0
|
|
|
|
|
|
// Allocates a generic_tokenizer for the version this object was constructed
// with and returns the raw pointer; nothing in this function releases it.
tokenizer* external_morpho::new_tokenizer() const {
  tokenizer* created = new generic_tokenizer(version);
  return created;
}
8004
|
|
|
|
|
|
|
|
8005
|
|
|
|
|
|
|
} // namespace morphodita |
8006
|
|
|
|
|
|
|
|
8007
|
|
|
|
|
|
|
///////// |
8008
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_lemma_addinfo.h |
8009
|
|
|
|
|
|
|
///////// |
8010
|
|
|
|
|
|
|
|
8011
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8012
|
|
|
|
|
|
|
// |
8013
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8014
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8015
|
|
|
|
|
|
|
// |
8016
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8017
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8018
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8019
|
|
|
|
|
|
|
|
8020
|
|
|
|
|
|
|
namespace morphodita { |
8021
|
|
|
|
|
|
|
|
8022
|
|
|
|
|
|
|
// Declarations |
8023
|
0
|
|
|
|
|
|
struct generic_lemma_addinfo { |
8024
|
|
|
|
|
|
|
inline static int raw_lemma_len(string_piece lemma); |
8025
|
|
|
|
|
|
|
inline static int lemma_id_len(string_piece lemma); |
8026
|
|
|
|
|
|
|
inline static string format(const unsigned char* addinfo, int addinfo_len); |
8027
|
|
|
|
|
|
|
inline static bool generatable(const unsigned char* addinfo, int addinfo_len); |
8028
|
|
|
|
|
|
|
|
8029
|
|
|
|
|
|
|
inline int parse(string_piece lemma, bool die_on_failure = false); |
8030
|
|
|
|
|
|
|
inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len); |
8031
|
|
|
|
|
|
|
|
8032
|
|
|
|
|
|
|
vector data; |
8033
|
|
|
|
|
|
|
}; |
8034
|
|
|
|
|
|
|
|
8035
|
|
|
|
|
|
|
// Definitions |
8036
|
|
|
|
|
|
|
int generic_lemma_addinfo::raw_lemma_len(string_piece lemma) { |
8037
|
0
|
|
|
|
|
|
return lemma.len; |
8038
|
|
|
|
|
|
|
} |
8039
|
|
|
|
|
|
|
|
8040
|
|
|
|
|
|
|
int generic_lemma_addinfo::lemma_id_len(string_piece lemma) { |
8041
|
0
|
|
|
|
|
|
return lemma.len; |
8042
|
|
|
|
|
|
|
} |
8043
|
|
|
|
|
|
|
|
8044
|
|
|
|
|
|
|
string generic_lemma_addinfo::format(const unsigned char* /*addinfo*/, int /*addinfo_len*/) { |
8045
|
|
|
|
|
|
|
return string(); |
8046
|
|
|
|
|
|
|
} |
8047
|
|
|
|
|
|
|
|
8048
|
|
|
|
|
|
|
bool generic_lemma_addinfo::generatable(const unsigned char* /*addinfo*/, int /*addinfo_len*/) { |
8049
|
|
|
|
|
|
|
return true; |
8050
|
|
|
|
|
|
|
} |
8051
|
|
|
|
|
|
|
|
8052
|
|
|
|
|
|
|
int generic_lemma_addinfo::parse(string_piece lemma, bool /*die_on_failure*/) { |
8053
|
0
|
|
|
|
|
|
return lemma.len; |
8054
|
|
|
|
|
|
|
} |
8055
|
|
|
|
|
|
|
|
8056
|
|
|
|
|
|
|
bool generic_lemma_addinfo::match_lemma_id(const unsigned char* /*other_addinfo*/, int /*other_addinfo_len*/) { |
8057
|
|
|
|
|
|
|
return true; |
8058
|
|
|
|
|
|
|
} |
8059
|
|
|
|
|
|
|
|
8060
|
|
|
|
|
|
|
} // namespace morphodita |
8061
|
|
|
|
|
|
|
|
8062
|
|
|
|
|
|
|
///////// |
8063
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho.h |
8064
|
|
|
|
|
|
|
///////// |
8065
|
|
|
|
|
|
|
|
8066
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8067
|
|
|
|
|
|
|
// |
8068
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8069
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8070
|
|
|
|
|
|
|
// |
8071
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8072
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8073
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8074
|
|
|
|
|
|
|
|
8075
|
|
|
|
|
|
|
namespace morphodita { |
8076
|
|
|
|
|
|
|
|
8077
|
4
|
|
|
|
|
|
class generic_morpho : public morpho { |
8078
|
|
|
|
|
|
|
public: |
8079
|
1
|
|
|
|
|
|
generic_morpho(unsigned version) : version(version) {} |
8080
|
|
|
|
|
|
|
|
8081
|
|
|
|
|
|
|
virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override; |
8082
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override; |
8083
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const override; |
8084
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const override; |
8085
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const override; |
8086
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const override; |
8087
|
|
|
|
|
|
|
|
8088
|
|
|
|
|
|
|
bool load(istream& is); |
8089
|
|
|
|
|
|
|
private: |
8090
|
|
|
|
|
|
|
inline void analyze_special(string_piece form, vector& lemmas) const; |
8091
|
|
|
|
|
|
|
|
8092
|
|
|
|
|
|
|
unsigned version; |
8093
|
|
|
|
|
|
|
morpho_dictionary dictionary; |
8094
|
|
|
|
|
|
|
unique_ptr statistical_guesser; |
8095
|
|
|
|
|
|
|
|
8096
|
|
|
|
|
|
|
string unknown_tag, number_tag, punctuation_tag, symbol_tag; |
8097
|
|
|
|
|
|
|
}; |
8098
|
|
|
|
|
|
|
|
8099
|
|
|
|
|
|
|
} // namespace morphodita |
8100
|
|
|
|
|
|
|
|
8101
|
|
|
|
|
|
|
///////// |
8102
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho.cpp |
8103
|
|
|
|
|
|
|
///////// |
8104
|
|
|
|
|
|
|
|
8105
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8106
|
|
|
|
|
|
|
// |
8107
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8108
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8109
|
|
|
|
|
|
|
// |
8110
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8111
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8112
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8113
|
|
|
|
|
|
|
|
8114
|
|
|
|
|
|
|
namespace morphodita { |
8115
|
|
|
|
|
|
|
|
8116
|
1
|
|
|
|
|
|
// Loads the model from the given stream.
//
// The stream is decompressed into a binary_decoder, from which are read, in
// order: the unknown, number, punctuation and symbol tags, the dictionary,
// and -- when the following byte is nonzero -- a statistical guesser.
//
// Returns false when decompression fails or the decoder reports an error;
// otherwise returns whether the decoded data were consumed exactly.
bool generic_morpho::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    // Every tag is stored as a one-byte length followed by that many chars.
    auto load_tag = [&data](string& tag) {
      unsigned length = data.next_1B();
      tag.assign(data.next<char>(length), length);
    };
    load_tag(unknown_tag);
    load_tag(number_tag);
    load_tag(punctuation_tag);
    load_tag(symbol_tag);

    // Load dictionary
    dictionary.load(data);

    // Optionally statistical guesser if present; any previous one is dropped
    // first so a guesser-less model leaves the pointer empty.
    statistical_guesser.reset();
    if (data.next_1B()) {
      statistical_guesser.reset(new morpho_statistical_guesser());
      statistical_guesser->load(data);
    }
  } catch (binary_decoder_error&) {
    return false;
  }

  return data.is_end();
}
8146
|
|
|
|
|
|
|
|
8147
|
7
|
|
|
|
|
|
int generic_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const { |
8148
|
|
|
|
|
|
|
lemmas.clear(); |
8149
|
|
|
|
|
|
|
|
8150
|
7
|
50
|
|
|
|
|
if (form.len) { |
8151
|
|
|
|
|
|
|
// Generate all casing variants if needed (they are different than given form). |
8152
|
|
|
|
|
|
|
string form_uclc; // first uppercase, rest lowercase |
8153
|
|
|
|
|
|
|
string form_lc; // all lowercase |
8154
|
7
|
50
|
|
|
|
|
generate_casing_variants(form, form_uclc, form_lc); |
8155
|
|
|
|
|
|
|
|
8156
|
|
|
|
|
|
|
// Start by analysing using the dictionary and all casing variants. |
8157
|
7
|
50
|
|
|
|
|
dictionary.analyze(form, lemmas); |
8158
|
7
|
50
|
|
|
|
|
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
|
0
|
|
|
|
|
|
8159
|
7
|
100
|
|
|
|
|
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
|
50
|
|
|
|
|
|
8160
|
7
|
50
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
8161
|
|
|
|
|
|
|
|
8162
|
|
|
|
|
|
|
// Then call analyze_special to handle numbers, punctuation and symbols. |
8163
|
0
|
0
|
|
|
|
|
analyze_special(form, lemmas); |
8164
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
8165
|
|
|
|
|
|
|
|
8166
|
|
|
|
|
|
|
// For the statistical guesser, use all casing variants. |
8167
|
0
|
0
|
|
|
|
|
if (guesser == GUESSER && statistical_guesser) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8168
|
0
|
0
|
|
|
|
|
if (form_uclc.empty() && form_lc.empty()) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8169
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, nullptr); |
8170
|
|
|
|
|
|
|
else { |
8171
|
0
|
0
|
|
|
|
|
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
8172
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, &used_rules); |
8173
|
0
|
0
|
|
|
|
|
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
|
0
|
|
|
|
|
|
8174
|
0
|
0
|
|
|
|
|
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
|
0
|
|
|
|
|
|
8175
|
|
|
|
|
|
|
} |
8176
|
|
|
|
|
|
|
} |
8177
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return GUESSER; |
8178
|
|
|
|
|
|
|
} |
8179
|
|
|
|
|
|
|
|
8180
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
8181
|
7
|
|
|
|
|
|
return -1; |
8182
|
|
|
|
|
|
|
} |
8183
|
|
|
|
|
|
|
|
8184
|
0
|
|
|
|
|
|
int generic_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector& forms) const { |
8185
|
|
|
|
|
|
|
forms.clear(); |
8186
|
|
|
|
|
|
|
|
8187
|
0
|
|
|
|
|
|
tag_filter filter(tag_wildcard); |
8188
|
|
|
|
|
|
|
|
8189
|
0
|
0
|
|
|
|
|
if (lemma.len) { |
8190
|
0
|
0
|
|
|
|
|
if (dictionary.generate(lemma, filter, forms)) |
|
|
0
|
|
|
|
|
|
8191
|
|
|
|
|
|
|
return NO_GUESSER; |
8192
|
|
|
|
|
|
|
} |
8193
|
|
|
|
|
|
|
|
8194
|
|
|
|
|
|
|
return -1; |
8195
|
|
|
|
|
|
|
} |
8196
|
|
|
|
|
|
|
|
8197
|
0
|
|
|
|
|
|
// Delegates to generic_lemma_addinfo::raw_lemma_len.
int generic_morpho::raw_lemma_len(string_piece lemma) const {
  const int length = generic_lemma_addinfo::raw_lemma_len(lemma);
  return length;
}
8200
|
|
|
|
|
|
|
|
8201
|
0
|
|
|
|
|
|
// Delegates to generic_lemma_addinfo::lemma_id_len.
int generic_morpho::lemma_id_len(string_piece lemma) const {
  const int length = generic_lemma_addinfo::lemma_id_len(lemma);
  return length;
}
8204
|
|
|
|
|
|
|
|
8205
|
7
|
|
|
|
|
|
// Returns the full length of the given form.
int generic_morpho::raw_form_len(string_piece form) const {
  const auto whole_form_length = form.len;
  return whole_form_length;
}
8208
|
|
|
|
|
|
|
|
8209
|
0
|
|
|
|
|
|
// Allocates a new generic_tokenizer for this model's version with `new` and
// returns the raw pointer.
tokenizer* generic_morpho::new_tokenizer() const {
  tokenizer* created = new generic_tokenizer(version);
  return created;
}
8212
|
|
|
|
|
|
|
|
8213
|
0
|
|
|
|
|
|
void generic_morpho::analyze_special(string_piece form, vector& lemmas) const { |
8214
|
|
|
|
|
|
|
using namespace unilib; |
8215
|
|
|
|
|
|
|
|
8216
|
|
|
|
|
|
|
// Analyzer for numbers, punctuation and symbols. |
8217
|
|
|
|
|
|
|
// Number is anything matching [+-]? is_Pn* ([.,] is_Pn*)? ([Ee] [+-]? is_Pn+)? for at least one is_Pn* nonempty. |
8218
|
|
|
|
|
|
|
// Punctuation is any form beginning with either unicode punctuation or punctuation_exceptions character. |
8219
|
|
|
|
|
|
|
// Beware that numbers takes precedence, so - is punctuation, -3 is number, -. is punctuation, -.3 is number. |
8220
|
0
|
0
|
|
|
|
|
if (!form.len) return; |
8221
|
|
|
|
|
|
|
|
8222
|
0
|
|
|
|
|
|
string_piece number = form; |
8223
|
0
|
|
|
|
|
|
char32_t first = utf8::decode(number.str, number.len); |
8224
|
|
|
|
|
|
|
|
8225
|
|
|
|
|
|
|
// Try matching a number. |
8226
|
|
|
|
|
|
|
char32_t codepoint = first; |
8227
|
|
|
|
|
|
|
bool any_digit = false; |
8228
|
0
|
0
|
|
|
|
|
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
8229
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8230
|
0
|
0
|
|
|
|
|
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8231
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8232
|
0
|
0
|
|
|
|
|
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
|
0
|
|
|
|
|
|
8233
|
0
|
|
|
|
|
|
codepoint = utf8::decode(number.str, number.len); |
8234
|
0
|
0
|
|
|
|
|
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
8235
|
|
|
|
|
|
|
any_digit = false; |
8236
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8237
|
|
|
|
|
|
|
} |
8238
|
|
|
|
|
|
|
|
8239
|
0
|
0
|
|
|
|
|
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8240
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), number_tag); |
8241
|
0
|
|
|
|
|
|
return; |
8242
|
|
|
|
|
|
|
} |
8243
|
|
|
|
|
|
|
|
8244
|
|
|
|
|
|
|
// Try matching punctuation or symbol. |
8245
|
|
|
|
|
|
|
bool punctuation = true, symbol = true; |
8246
|
0
|
|
|
|
|
|
string_piece form_ori = form; |
8247
|
0
|
0
|
|
|
|
|
while (form.len) { |
8248
|
0
|
|
|
|
|
|
codepoint = utf8::decode(form.str, form.len); |
8249
|
0
|
0
|
|
|
|
|
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
|
|
0
|
|
|
|
|
|
8250
|
0
|
0
|
|
|
|
|
symbol = symbol && unicode::category(codepoint) & unicode::S; |
|
|
0
|
|
|
|
|
|
8251
|
|
|
|
|
|
|
} |
8252
|
0
|
0
|
|
|
|
|
if (punctuation) |
8253
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
8254
|
0
|
0
|
|
|
|
|
else if (symbol) |
8255
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag); |
8256
|
|
|
|
|
|
|
} |
8257
|
|
|
|
|
|
|
|
8258
|
|
|
|
|
|
|
} // namespace morphodita |
8259
|
|
|
|
|
|
|
|
8260
|
|
|
|
|
|
|
///////// |
8261
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho_encoder.h |
8262
|
|
|
|
|
|
|
///////// |
8263
|
|
|
|
|
|
|
|
8264
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8265
|
|
|
|
|
|
|
// |
8266
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8267
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8268
|
|
|
|
|
|
|
// |
8269
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8270
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8271
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8272
|
|
|
|
|
|
|
|
8273
|
|
|
|
|
|
|
namespace morphodita { |
8274
|
|
|
|
|
|
|
|
8275
|
|
|
|
|
|
|
class generic_morpho_encoder { |
8276
|
|
|
|
|
|
|
public: |
8277
|
0
|
|
|
|
|
|
struct tags { |
8278
|
|
|
|
|
|
|
string unknown_tag, number_tag, punctuation_tag, symbol_tag; |
8279
|
|
|
|
|
|
|
}; |
8280
|
|
|
|
|
|
|
static void encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho); |
8281
|
|
|
|
|
|
|
}; |
8282
|
|
|
|
|
|
|
|
8283
|
|
|
|
|
|
|
} // namespace morphodita |
8284
|
|
|
|
|
|
|
|
8285
|
|
|
|
|
|
|
///////// |
8286
|
|
|
|
|
|
|
// File: morphodita/morpho/persistent_unordered_map_encoder.h |
8287
|
|
|
|
|
|
|
///////// |
8288
|
|
|
|
|
|
|
|
8289
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8290
|
|
|
|
|
|
|
// |
8291
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8292
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8293
|
|
|
|
|
|
|
// |
8294
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8295
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8296
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8297
|
|
|
|
|
|
|
|
8298
|
|
|
|
|
|
|
namespace morphodita { |
8299
|
|
|
|
|
|
|
|
8300
|
|
|
|
|
|
|
template |
8301
|
0
|
|
|
|
|
|
persistent_unordered_map::persistent_unordered_map(const unordered_map& map, double load_factor, EntryEncode entry_encode) { |
8302
|
0
|
0
|
|
|
|
|
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8303
|
0
|
|
|
|
|
|
} |
8304
|
|
|
|
|
|
|
|
8305
|
|
|
|
|
|
|
template |
8306
|
0
|
|
|
|
|
|
persistent_unordered_map::persistent_unordered_map(const unordered_map& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode) { |
8307
|
|
|
|
|
|
|
// Copy data, possibly including prefixes and suffixes |
8308
|
0
|
0
|
|
|
|
|
std::map enlarged_map(map.begin(), map.end()); |
|
|
0
|
|
|
|
|
|
8309
|
|
|
|
|
|
|
|
8310
|
0
|
0
|
|
|
|
|
for (auto&& entry : map) { |
|
|
0
|
|
|
|
|
|
8311
|
0
|
|
|
|
|
|
const string& key = entry.first; |
8312
|
|
|
|
|
|
|
|
8313
|
0
|
0
|
|
|
|
|
if (!key.empty() && add_prefixes) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8314
|
0
|
0
|
|
|
|
|
for (unsigned i = key.size() - 1; i; i--) |
|
|
0
|
|
|
|
|
|
8315
|
0
|
0
|
|
|
|
|
enlarged_map[key.substr(0, i)]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8316
|
|
|
|
|
|
|
|
8317
|
0
|
0
|
|
|
|
|
if (!key.empty() && add_suffixes) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8318
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < key.size(); i++) |
|
|
0
|
|
|
|
|
|
8319
|
0
|
0
|
|
|
|
|
enlarged_map[key.substr(i)]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8320
|
|
|
|
|
|
|
} |
8321
|
|
|
|
|
|
|
|
8322
|
0
|
0
|
|
|
|
|
construct(enlarged_map, load_factor, entry_encode); |
|
|
0
|
|
|
|
|
|
8323
|
0
|
|
|
|
|
|
} |
8324
|
|
|
|
|
|
|
|
8325
|
|
|
|
|
|
|
// We could (and used to) use unordered_map as input parameter. |
8326
|
|
|
|
|
|
|
// Nevertheless, as order is unspecified, the resulting persistent_unordered_map |
8327
|
|
|
|
|
|
|
// has different collision chains when generated on 32-bit and 64-bit machines. |
8328
|
|
|
|
|
|
|
// To guarantee uniform binary representation, we use map instead. |
8329
|
|
|
|
|
|
|
template |
8330
|
0
|
|
|
|
|
|
void persistent_unordered_map::construct(const map& map, double load_factor, EntryEncode entry_encode) { |
8331
|
|
|
|
|
|
|
// 1) Count number of elements for each size |
8332
|
|
|
|
|
|
|
vector sizes; |
8333
|
0
|
0
|
|
|
|
|
for (auto&& elem : map) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8334
|
0
|
|
|
|
|
|
unsigned len = elem.first.size(); |
8335
|
0
|
0
|
|
|
|
|
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8336
|
0
|
|
|
|
|
|
sizes[len]++; |
8337
|
|
|
|
|
|
|
} |
8338
|
0
|
0
|
|
|
|
|
for (auto&& size : sizes) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8339
|
0
|
0
|
|
|
|
|
resize(unsigned(load_factor * size)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8340
|
|
|
|
|
|
|
|
8341
|
|
|
|
|
|
|
// 2) Add sizes of element data |
8342
|
0
|
0
|
|
|
|
|
for (auto&& elem : map) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8343
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8344
|
0
|
0
|
|
|
|
|
entry_encode(enc, elem.second); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8345
|
0
|
|
|
|
|
|
add(elem.first.c_str(), elem.first.size(), enc.data.size()); |
8346
|
|
|
|
|
|
|
} |
8347
|
0
|
0
|
|
|
|
|
done_adding(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8348
|
|
|
|
|
|
|
|
8349
|
|
|
|
|
|
|
// 3) Fill in element data |
8350
|
0
|
0
|
|
|
|
|
for (auto&& elem : map) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8351
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8352
|
0
|
0
|
|
|
|
|
entry_encode(enc, elem.second); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8353
|
0
|
|
|
|
|
|
small_memcpy(fill(elem.first.c_str(), elem.first.size(), enc.data.size()), enc.data.data(), enc.data.size()); |
8354
|
|
|
|
|
|
|
} |
8355
|
0
|
|
|
|
|
|
done_filling(); |
8356
|
0
|
|
|
|
|
|
} |
8357
|
|
|
|
|
|
|
|
8358
|
0
|
|
|
|
|
|
// Writes the number of hashes via add_1B, then saves every hash in order.
void persistent_unordered_map::save(binary_encoder& enc) {
  enc.add_1B(hashes.size());

  for (auto hash_it = hashes.begin(); hash_it != hashes.end(); ++hash_it)
    hash_it->save(enc);
}
8364
|
|
|
|
|
|
|
|
8365
|
0
|
|
|
|
|
|
// Writes `hash` and then `data`, each as its size via add_4B followed by its
// contents.
void persistent_unordered_map::fnv_hash::save(binary_encoder& enc) {
  const auto hash_size = hash.size();
  enc.add_4B(hash_size);
  enc.add_data(hash);

  const auto data_size = data.size();
  enc.add_4B(data_size);
  enc.add_data(data);
}
8372
|
|
|
|
|
|
|
|
8373
|
|
|
|
|
|
|
} // namespace morphodita |
8374
|
|
|
|
|
|
|
|
8375
|
|
|
|
|
|
|
///////// |
8376
|
|
|
|
|
|
|
// File: morphodita/morpho/raw_morpho_dictionary_reader.h |
8377
|
|
|
|
|
|
|
///////// |
8378
|
|
|
|
|
|
|
|
8379
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8380
|
|
|
|
|
|
|
// |
8381
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8382
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8383
|
|
|
|
|
|
|
// |
8384
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8385
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8386
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8387
|
|
|
|
|
|
|
|
8388
|
|
|
|
|
|
|
namespace morphodita { |
8389
|
|
|
|
|
|
|
|
8390
|
0
|
|
|
|
|
|
class raw_morpho_dictionary_reader { |
8391
|
|
|
|
|
|
|
public: |
8392
|
0
|
|
|
|
|
|
raw_morpho_dictionary_reader(istream& in) : in(in) {} |
8393
|
|
|
|
|
|
|
bool next_lemma(string& lemma, vector>& tagged_forms); |
8394
|
|
|
|
|
|
|
private: |
8395
|
|
|
|
|
|
|
istream& in; |
8396
|
|
|
|
|
|
|
string line; |
8397
|
|
|
|
|
|
|
vector tokens; |
8398
|
|
|
|
|
|
|
unordered_set seen_lemmas; |
8399
|
|
|
|
|
|
|
}; |
8400
|
|
|
|
|
|
|
|
8401
|
|
|
|
|
|
|
} // namespace morphodita |
8402
|
|
|
|
|
|
|
|
8403
|
|
|
|
|
|
|
///////// |
8404
|
|
|
|
|
|
|
// File: utils/new_unique_ptr.h |
8405
|
|
|
|
|
|
|
///////// |
8406
|
|
|
|
|
|
|
|
8407
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
8408
|
|
|
|
|
|
|
// |
8409
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8410
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8411
|
|
|
|
|
|
|
// |
8412
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8413
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8414
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8415
|
|
|
|
|
|
|
|
8416
|
|
|
|
|
|
|
namespace utils { |
8417
|
|
|
|
|
|
|
|
8418
|
|
|
|
|
|
|
// Creates a T with `new`, perfectly forwarding the arguments to its
// constructor, and returns it owned by a unique_ptr.
template <typename T, typename... Args>
std::unique_ptr<T> new_unique_ptr(Args&&... args) {
  return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
}
8422
|
|
|
|
|
|
|
|
8423
|
|
|
|
|
|
|
} // namespace utils |
8424
|
|
|
|
|
|
|
|
8425
|
|
|
|
|
|
|
///////// |
8426
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_dictionary_encoder.h |
8427
|
|
|
|
|
|
|
///////// |
8428
|
|
|
|
|
|
|
|
8429
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8430
|
|
|
|
|
|
|
// |
8431
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8432
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8433
|
|
|
|
|
|
|
// |
8434
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8435
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8436
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8437
|
|
|
|
|
|
|
|
8438
|
|
|
|
|
|
|
namespace morphodita { |
8439
|
|
|
|
|
|
|
|
8440
|
|
|
|
|
|
|
// Declarations |
8441
|
|
|
|
|
|
|
template |
8442
|
|
|
|
|
|
|
class morpho_dictionary_encoder { |
8443
|
|
|
|
|
|
|
public: |
8444
|
|
|
|
|
|
|
static void encode(istream& is, int max_suffix_len, binary_encoder& enc); |
8445
|
|
|
|
|
|
|
}; |
8446
|
|
|
|
|
|
|
|
8447
|
|
|
|
|
|
|
// Definitions |
8448
|
|
|
|
|
|
|
template |
8449
|
0
|
|
|
|
|
|
class dictionary { |
8450
|
|
|
|
|
|
|
public: |
8451
|
|
|
|
|
|
|
void load(istream& is, int max_suffix_len); |
8452
|
|
|
|
|
|
|
void encode(binary_encoder& enc); |
8453
|
|
|
|
|
|
|
|
8454
|
|
|
|
|
|
|
private: |
8455
|
0
|
|
|
|
|
|
class trie { |
8456
|
|
|
|
|
|
|
public: |
8457
|
0
|
|
|
|
|
|
trie() : depth(0) {} |
8458
|
|
|
|
|
|
|
|
8459
|
0
|
|
|
|
|
|
void add(const char* str) { |
8460
|
0
|
0
|
|
|
|
|
if (!*str) return; |
8461
|
|
|
|
|
|
|
|
8462
|
0
|
0
|
|
|
|
|
for (auto&& child : children) |
8463
|
0
|
0
|
|
|
|
|
if (child.first == *str) { |
8464
|
0
|
|
|
|
|
|
child.second->add(str + 1); |
8465
|
0
|
|
|
|
|
|
depth = max(depth, 1 + child.second->depth); |
8466
|
|
|
|
|
|
|
return; |
8467
|
|
|
|
|
|
|
} |
8468
|
0
|
0
|
|
|
|
|
children.emplace_back(*str, new_unique_ptr()); |
8469
|
0
|
|
|
|
|
|
children.back().second->add(str + 1); |
8470
|
0
|
|
|
|
|
|
depth = max(depth, 1 + children.back().second->depth); |
8471
|
|
|
|
|
|
|
} |
8472
|
|
|
|
|
|
|
|
8473
|
0
|
|
|
|
|
|
string find_candidate_prefix(int max_suffix_len) { |
8474
|
|
|
|
|
|
|
string current, best; |
8475
|
0
|
|
|
|
|
|
int best_length = 0; |
8476
|
0
|
0
|
|
|
|
|
find_candidate_prefix(max_suffix_len, current, best, best_length, 0); |
8477
|
0
|
|
|
|
|
|
return best; |
8478
|
|
|
|
|
|
|
} |
8479
|
0
|
|
|
|
|
|
void find_candidate_prefix(int max_suffix_len, string& current, string& best, int& best_length, int length) { |
8480
|
0
|
0
|
|
|
|
|
if (depth < max_suffix_len && length > best_length) { |
|
|
0
|
|
|
|
|
|
8481
|
|
|
|
|
|
|
best = current; |
8482
|
0
|
|
|
|
|
|
best_length = length; |
8483
|
|
|
|
|
|
|
} |
8484
|
0
|
0
|
|
|
|
|
for (auto&& child : children) { |
8485
|
0
|
|
|
|
|
|
current.push_back(child.first); |
8486
|
0
|
0
|
|
|
|
|
child.second->find_candidate_prefix(max_suffix_len, current, best, best_length, children.size() == 1 ? length + 1 : 1); |
8487
|
0
|
|
|
|
|
|
current.resize(current.size() - 1); |
8488
|
|
|
|
|
|
|
} |
8489
|
0
|
|
|
|
|
|
} |
8490
|
|
|
|
|
|
|
|
8491
|
|
|
|
|
|
|
vector>> children; |
8492
|
|
|
|
|
|
|
int depth; |
8493
|
|
|
|
|
|
|
}; |
8494
|
|
|
|
|
|
|
|
8495
|
0
|
|
|
|
|
|
class histogram { |
8496
|
|
|
|
|
|
|
public: |
8497
|
0
|
|
|
|
|
|
void add(const string& str) { |
8498
|
0
|
0
|
|
|
|
|
if (str.size() >= lengths.size()) lengths.resize(str.size() + 1); |
8499
|
|
|
|
|
|
|
lengths[str.size()].insert(str); |
8500
|
0
|
|
|
|
|
|
} |
8501
|
|
|
|
|
|
|
|
8502
|
0
|
|
|
|
|
|
void encode(binary_encoder& enc) { |
8503
|
0
|
|
|
|
|
|
enc.add_1B(lengths.size()); |
8504
|
0
|
0
|
|
|
|
|
for (auto&& set : lengths) |
8505
|
0
|
|
|
|
|
|
enc.add_4B(set.size()); |
8506
|
0
|
|
|
|
|
|
} |
8507
|
|
|
|
|
|
|
|
8508
|
|
|
|
|
|
|
vector> lengths; |
8509
|
|
|
|
|
|
|
}; |
8510
|
|
|
|
|
|
|
|
8511
|
0
|
|
|
|
|
|
struct lemma_info { |
8512
|
0
|
|
|
|
|
|
lemma_info(string lemma) { |
8513
|
0
|
0
|
|
|
|
|
this->lemma = lemma.substr(0, addinfo.parse(lemma, true)); |
8514
|
0
|
|
|
|
|
|
} |
8515
|
|
|
|
|
|
|
|
8516
|
|
|
|
|
|
|
string lemma; |
8517
|
|
|
|
|
|
|
LemmaAddinfo addinfo; |
8518
|
0
|
|
|
|
|
|
struct lemma_form_info { |
8519
|
0
|
|
|
|
|
|
lemma_form_info(string form, int clas) : form(form), clas(clas) {} |
8520
|
|
|
|
|
|
|
|
8521
|
|
|
|
|
|
|
string form; |
8522
|
|
|
|
|
|
|
int clas; |
8523
|
|
|
|
|
|
|
|
8524
|
0
|
0
|
|
|
|
|
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8525
|
|
|
|
|
|
|
}; |
8526
|
|
|
|
|
|
|
vector forms; |
8527
|
|
|
|
|
|
|
|
8528
|
0
|
0
|
|
|
|
|
bool operator<(const lemma_info& other) const { return lemma < other.lemma || (lemma == other.lemma && addinfo.data < other.addinfo.data); } |
8529
|
|
|
|
|
|
|
}; |
8530
|
|
|
|
|
|
|
|
8531
|
|
|
|
|
|
|
unordered_map classes; |
8532
|
|
|
|
|
|
|
unordered_map>> suffixes; |
8533
|
|
|
|
|
|
|
|
8534
|
|
|
|
|
|
|
vector tags; |
8535
|
|
|
|
|
|
|
unordered_map tags_map; |
8536
|
|
|
|
|
|
|
|
8537
|
|
|
|
|
|
|
histogram lemmas_hist, forms_hist; |
8538
|
|
|
|
|
|
|
|
8539
|
|
|
|
|
|
|
vector lemmas; |
8540
|
|
|
|
|
|
|
}; |
8541
|
|
|
|
|
|
|
|
8542
|
|
|
|
|
|
|
template |
8543
|
0
|
|
|
|
|
|
void morpho_dictionary_encoder::encode(istream& is, int max_suffix_len, binary_encoder& enc) { |
8544
|
0
|
|
|
|
|
|
dictionary dict; |
8545
|
|
|
|
|
|
|
|
8546
|
|
|
|
|
|
|
// Load the dictionary and create classes |
8547
|
0
|
0
|
|
|
|
|
dict.load(is, max_suffix_len); |
8548
|
|
|
|
|
|
|
|
8549
|
|
|
|
|
|
|
// Encode the dictionary |
8550
|
0
|
0
|
|
|
|
|
dict.encode(enc); |
8551
|
0
|
|
|
|
|
|
} |
8552
|
|
|
|
|
|
|
|
8553
|
|
|
|
|
|
|
template |
8554
|
0
|
|
|
|
|
|
void dictionary::load(istream& is, int max_suffix_len) { |
8555
|
|
|
|
|
|
|
// Load lemmas and create classes |
8556
|
0
|
|
|
|
|
|
raw_morpho_dictionary_reader raw(is); |
8557
|
|
|
|
|
|
|
string lemma; |
8558
|
0
|
|
|
|
|
|
vector> forms; |
8559
|
0
|
0
|
|
|
|
|
while(raw.next_lemma(lemma, forms)) { |
|
|
0
|
|
|
|
|
|
8560
|
|
|
|
|
|
|
// Make sure forms are unique |
8561
|
|
|
|
|
|
|
sort(forms.begin(), forms.end()); |
8562
|
|
|
|
|
|
|
auto forms_end = unique(forms.begin(), forms.end()); |
8563
|
0
|
0
|
|
|
|
|
if (forms_end != forms.end()) { |
8564
|
|
|
|
|
|
|
// cerr << "Warning: repeated form-tag in lemma " << lemma << '.' << endl; |
8565
|
|
|
|
|
|
|
forms.erase(forms_end, forms.end()); |
8566
|
|
|
|
|
|
|
} |
8567
|
|
|
|
|
|
|
|
8568
|
|
|
|
|
|
|
// Create lemma_info |
8569
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(lemma); |
8570
|
|
|
|
|
|
|
auto& lemma_info = lemmas.back(); |
8571
|
0
|
0
|
|
|
|
|
lemmas_hist.add(lemma_info.lemma); |
8572
|
|
|
|
|
|
|
|
8573
|
|
|
|
|
|
|
// Create classes |
8574
|
0
|
0
|
|
|
|
|
while (!forms.empty()) { |
8575
|
|
|
|
|
|
|
trie t; |
8576
|
0
|
0
|
|
|
|
|
for (auto&& form : forms) |
8577
|
0
|
0
|
|
|
|
|
t.add(form.first.c_str()); |
8578
|
|
|
|
|
|
|
|
8579
|
|
|
|
|
|
|
// Find prefix of forms in class being added. |
8580
|
0
|
0
|
|
|
|
|
string prefix = t.find_candidate_prefix(max_suffix_len); |
8581
|
|
|
|
|
|
|
|
8582
|
|
|
|
|
|
|
// Find forms of the class being added. |
8583
|
|
|
|
|
|
|
auto start = forms.begin(); |
8584
|
0
|
0
|
|
|
|
|
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8585
|
0
|
0
|
|
|
|
|
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8586
|
|
|
|
|
|
|
auto end = start; |
8587
|
0
|
0
|
|
|
|
|
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8588
|
|
|
|
|
|
|
|
8589
|
|
|
|
|
|
|
// Find common prefix of class forms -- may be larger than prefix. |
8590
|
0
|
|
|
|
|
|
int common_prefix = prefix.size(); |
8591
|
0
|
0
|
|
|
|
|
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8592
|
|
|
|
|
|
|
|
8593
|
|
|
|
|
|
|
string clas; |
8594
|
0
|
0
|
|
|
|
|
for (auto form = start; form != end; form++) { |
8595
|
0
|
0
|
|
|
|
|
if (!clas.empty()) clas.push_back('\t'); |
|
|
0
|
|
|
|
|
|
8596
|
0
|
0
|
|
|
|
|
clas.append(form->first, common_prefix, string::npos); |
8597
|
0
|
0
|
|
|
|
|
clas.push_back('\t'); |
8598
|
|
|
|
|
|
|
clas.append(form->second); |
8599
|
|
|
|
|
|
|
} |
8600
|
|
|
|
|
|
|
|
8601
|
0
|
|
|
|
|
|
auto class_it = classes.emplace(clas, int(classes.size())); |
8602
|
0
|
|
|
|
|
|
int class_id = class_it.first->second; |
8603
|
0
|
0
|
|
|
|
|
if (class_it.second) { |
8604
|
|
|
|
|
|
|
// New class, add it, together with its tags. |
8605
|
0
|
0
|
|
|
|
|
for (auto form = start; form != end; form++) { |
8606
|
0
|
|
|
|
|
|
int tag = tags_map.emplace(form->second, int(tags.size())).first->second; |
8607
|
0
|
0
|
|
|
|
|
if (tag >= int(tags.size())) tags.emplace_back(form->second); |
|
|
0
|
|
|
|
|
|
8608
|
0
|
0
|
|
|
|
|
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8609
|
|
|
|
|
|
|
} |
8610
|
|
|
|
|
|
|
} |
8611
|
|
|
|
|
|
|
|
8612
|
|
|
|
|
|
|
// Move forms in the class being added to lemma and remove them from unprocessed forms. |
8613
|
0
|
0
|
|
|
|
|
lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id); |
|
|
0
|
|
|
|
|
|
8614
|
0
|
0
|
|
|
|
|
forms_hist.add(lemma_info.forms.back().form); |
8615
|
|
|
|
|
|
|
forms.erase(start, end); |
8616
|
|
|
|
|
|
|
} |
8617
|
|
|
|
|
|
|
stable_sort(lemma_info.forms.begin(), lemma_info.forms.end()); |
8618
|
|
|
|
|
|
|
} |
8619
|
|
|
|
|
|
|
stable_sort(lemmas.begin(), lemmas.end()); |
8620
|
0
|
|
|
|
|
|
} |
8621
|
|
|
|
|
|
|
|
8622
|
|
|
|
|
|
|
template |
8623
|
0
|
|
|
|
|
|
void dictionary::encode(binary_encoder& enc) { |
8624
|
|
|
|
|
|
|
// Encode lemmas and forms |
8625
|
0
|
|
|
|
|
|
lemmas_hist.encode(enc); |
8626
|
0
|
|
|
|
|
|
forms_hist.encode(enc); |
8627
|
|
|
|
|
|
|
|
8628
|
0
|
|
|
|
|
|
string prev = ""; |
8629
|
0
|
|
|
|
|
|
enc.add_4B(lemmas.size()); |
8630
|
0
|
0
|
|
|
|
|
for (auto&& lemma : lemmas) { |
8631
|
|
|
|
|
|
|
int cpl = 0; |
8632
|
0
|
0
|
|
|
|
|
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8633
|
|
|
|
|
|
|
|
8634
|
0
|
0
|
|
|
|
|
enc.add_1B(prev.length() - cpl); |
8635
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma.lemma.size() - cpl); |
8636
|
0
|
0
|
|
|
|
|
enc.add_data(lemma.lemma.substr(cpl)); |
8637
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma.addinfo.data.size()); |
8638
|
|
|
|
|
|
|
enc.add_data(lemma.addinfo.data); |
8639
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma.forms.size()); |
8640
|
|
|
|
|
|
|
|
8641
|
|
|
|
|
|
|
string prev_form = lemma.lemma; |
8642
|
0
|
0
|
|
|
|
|
for (auto&& lemma_form : lemma.forms) { |
8643
|
|
|
|
|
|
|
unsigned best_prev_from = 0, best_form_from = 0, best_len = 0; |
8644
|
0
|
0
|
|
|
|
|
for (unsigned prev_from = 0; prev_from < prev_form.size(); prev_from++) |
8645
|
0
|
0
|
|
|
|
|
for (unsigned form_from = 0; form_from < lemma_form.form.size(); form_from++) { |
8646
|
|
|
|
|
|
|
unsigned len = 0; |
8647
|
0
|
0
|
|
|
|
|
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8648
|
0
|
0
|
|
|
|
|
if (len > best_len) best_prev_from = prev_from, best_form_from = form_from, best_len = len; |
8649
|
|
|
|
|
|
|
} |
8650
|
|
|
|
|
|
|
|
8651
|
|
|
|
|
|
|
enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 }; |
8652
|
0
|
0
|
|
|
|
|
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8653
|
|
|
|
|
|
|
ADD_START * (best_form_from>0) + ADD_END * (best_form_from+best_len
|
8654
|
0
|
0
|
|
|
|
|
if (best_prev_from > 0) enc.add_1B(best_prev_from); |
|
|
0
|
|
|
|
|
|
8655
|
0
|
0
|
|
|
|
|
if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len); |
|
|
0
|
|
|
|
|
|
8656
|
0
|
0
|
|
|
|
|
if (best_form_from > 0) { |
8657
|
0
|
0
|
|
|
|
|
enc.add_1B(best_form_from); |
8658
|
0
|
0
|
|
|
|
|
enc.add_data(lemma_form.form.substr(0, best_form_from)); |
8659
|
|
|
|
|
|
|
} |
8660
|
0
|
0
|
|
|
|
|
if (best_form_from + best_len < lemma_form.form.size()) { |
8661
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma_form.form.size() - best_form_from - best_len); |
8662
|
0
|
0
|
|
|
|
|
enc.add_data(lemma_form.form.substr(best_form_from + best_len)); |
8663
|
|
|
|
|
|
|
} |
8664
|
0
|
0
|
|
|
|
|
enc.add_2B(lemma_form.clas); |
8665
|
|
|
|
|
|
|
|
8666
|
0
|
|
|
|
|
|
prev_form = lemma_form.form; |
8667
|
|
|
|
|
|
|
} |
8668
|
|
|
|
|
|
|
|
8669
|
|
|
|
|
|
|
prev = lemma.lemma; |
8670
|
|
|
|
|
|
|
} |
8671
|
|
|
|
|
|
|
|
8672
|
|
|
|
|
|
|
// Encode tags |
8673
|
0
|
0
|
|
|
|
|
enc.add_2B(tags.size()); |
8674
|
0
|
0
|
|
|
|
|
for (auto&& tag : tags) { |
8675
|
0
|
0
|
|
|
|
|
enc.add_1B(tag.size()); |
8676
|
|
|
|
|
|
|
enc.add_data(tag); |
8677
|
|
|
|
|
|
|
} |
8678
|
|
|
|
|
|
|
|
8679
|
|
|
|
|
|
|
// Encode classes |
8680
|
0
|
0
|
|
|
|
|
persistent_unordered_map(suffixes, 5, false, true, [](binary_encoder& enc, const map>& suffix) { |
8681
|
0
|
|
|
|
|
|
enc.add_2B(suffix.size()); |
8682
|
0
|
0
|
|
|
|
|
for (auto&& clas : suffix) |
8683
|
0
|
|
|
|
|
|
enc.add_2B(clas.first); |
8684
|
|
|
|
|
|
|
uint32_t tags = 0, prev_tags = 0; |
8685
|
0
|
0
|
|
|
|
|
for (auto&& clas : suffix) { |
8686
|
0
|
0
|
|
|
|
|
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
8687
|
|
|
|
|
|
|
prev_tags = tags; |
8688
|
0
|
|
|
|
|
|
tags += clas.second.size(); |
8689
|
|
|
|
|
|
|
} |
8690
|
0
|
0
|
|
|
|
|
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
8691
|
0
|
0
|
|
|
|
|
for (auto&& clas : suffix) |
8692
|
0
|
0
|
|
|
|
|
for (auto&& tag : clas.second) |
8693
|
0
|
|
|
|
|
|
enc.add_2B(tag); |
8694
|
0
|
0
|
|
|
|
|
}).save(enc); |
8695
|
0
|
|
|
|
|
|
} |
8696
|
|
|
|
|
|
|
|
8697
|
|
|
|
|
|
|
} // namespace morphodita |
8698
|
|
|
|
|
|
|
|
8699
|
|
|
|
|
|
|
///////// |
8700
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_prefix_guesser_encoder.h |
8701
|
|
|
|
|
|
|
///////// |
8702
|
|
|
|
|
|
|
|
8703
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8704
|
|
|
|
|
|
|
// |
8705
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8706
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8707
|
|
|
|
|
|
|
// |
8708
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8709
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8710
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8711
|
|
|
|
|
|
|
|
8712
|
|
|
|
|
|
|
namespace morphodita { |
8713
|
|
|
|
|
|
|
|
8714
|
|
|
|
|
|
|
class morpho_prefix_guesser_encoder { |
8715
|
|
|
|
|
|
|
public: |
8716
|
|
|
|
|
|
|
static void encode(istream& is, binary_encoder& enc); |
8717
|
|
|
|
|
|
|
}; |
8718
|
|
|
|
|
|
|
|
8719
|
|
|
|
|
|
|
} // namespace morphodita |
8720
|
|
|
|
|
|
|
|
8721
|
|
|
|
|
|
|
///////// |
8722
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_encoder.h |
8723
|
|
|
|
|
|
|
///////// |
8724
|
|
|
|
|
|
|
|
8725
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8726
|
|
|
|
|
|
|
// |
8727
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8728
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8729
|
|
|
|
|
|
|
// |
8730
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8731
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8732
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8733
|
|
|
|
|
|
|
|
8734
|
|
|
|
|
|
|
namespace morphodita { |
8735
|
|
|
|
|
|
|
|
8736
|
|
|
|
|
|
|
class morpho_statistical_guesser_encoder { |
8737
|
|
|
|
|
|
|
public: |
8738
|
|
|
|
|
|
|
static void encode(istream& is, binary_encoder& enc); |
8739
|
|
|
|
|
|
|
}; |
8740
|
|
|
|
|
|
|
|
8741
|
|
|
|
|
|
|
} // namespace morphodita |
8742
|
|
|
|
|
|
|
|
8743
|
|
|
|
|
|
|
///////// |
8744
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho_encoder.cpp |
8745
|
|
|
|
|
|
|
///////// |
8746
|
|
|
|
|
|
|
|
8747
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8748
|
|
|
|
|
|
|
// |
8749
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8750
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8751
|
|
|
|
|
|
|
// |
8752
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8753
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8754
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8755
|
|
|
|
|
|
|
|
8756
|
|
|
|
|
|
|
namespace morphodita { |
8757
|
|
|
|
|
|
|
|
8758
|
0
|
|
|
|
|
|
void generic_morpho_encoder::encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho) { |
8759
|
0
|
|
|
|
|
|
binary_encoder enc; |
8760
|
|
|
|
|
|
|
|
8761
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.unknown_tag.size()); |
8762
|
|
|
|
|
|
|
enc.add_data(tags.unknown_tag); |
8763
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.number_tag.size()); |
8764
|
|
|
|
|
|
|
enc.add_data(tags.number_tag); |
8765
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.punctuation_tag.size()); |
8766
|
|
|
|
|
|
|
enc.add_data(tags.punctuation_tag); |
8767
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.symbol_tag.size()); |
8768
|
|
|
|
|
|
|
enc.add_data(tags.symbol_tag); |
8769
|
|
|
|
|
|
|
|
8770
|
|
|
|
|
|
|
// cerr << "Encoding dictionary." << endl; |
8771
|
0
|
0
|
|
|
|
|
morpho_dictionary_encoder::encode(in_dictionary, max_suffix_len, enc); |
8772
|
|
|
|
|
|
|
|
8773
|
|
|
|
|
|
|
// Load and encode statistical guesser if requested |
8774
|
0
|
0
|
|
|
|
|
enc.add_1B(bool(in_statistical_guesser)); |
8775
|
0
|
0
|
|
|
|
|
if (in_statistical_guesser) { |
8776
|
|
|
|
|
|
|
// cerr << "Encoding statistical guesser." << endl; |
8777
|
0
|
0
|
|
|
|
|
morpho_statistical_guesser_encoder::encode(in_statistical_guesser, enc); |
8778
|
|
|
|
|
|
|
} |
8779
|
|
|
|
|
|
|
|
8780
|
|
|
|
|
|
|
// done, save the dictionary |
8781
|
|
|
|
|
|
|
// cerr << "Compressing dictionary." << endl; |
8782
|
0
|
0
|
|
|
|
|
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8783
|
|
|
|
|
|
|
// cerr << "Dictionary saved." << endl; |
8784
|
0
|
|
|
|
|
|
} |
8785
|
|
|
|
|
|
|
|
8786
|
|
|
|
|
|
|
} // namespace morphodita |
8787
|
|
|
|
|
|
|
|
8788
|
|
|
|
|
|
|
///////// |
8789
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_ids.h |
8790
|
|
|
|
|
|
|
///////// |
8791
|
|
|
|
|
|
|
|
8792
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8793
|
|
|
|
|
|
|
// |
8794
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8795
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8796
|
|
|
|
|
|
|
// |
8797
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8798
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8799
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8800
|
|
|
|
|
|
|
|
8801
|
|
|
|
|
|
|
namespace morphodita { |
8802
|
|
|
|
|
|
|
|
8803
|
|
|
|
|
|
|
// Numeric identifiers of the morphology model kinds together with a parser
// for their textual names.
class morpho_ids {
 public:
  enum morpho_id {
    CZECH = 0,
    ENGLISH_V1 = 1,
    GENERIC = 2,
    EXTERNAL = 3,
    ENGLISH_V2 = 4,
    ENGLISH_V3 = 5, ENGLISH = ENGLISH_V3,
    SLOVAK_PDT = 6,
    DERIVATOR_DICTIONARY = 7,
  };

  // Translates a model name into its identifier. On success stores the
  // identifier in `id` and returns true; for an unknown name returns false
  // and leaves `id` untouched. "english" maps to ENGLISH (== ENGLISH_V3).
  static bool parse(const string& str, morpho_id& id) {
    const struct { const char* name; morpho_id value; } known[] = {
      { "czech", CZECH },
      { "english", ENGLISH },
      { "external", EXTERNAL },
      { "generic", GENERIC },
      { "slovak_pdt", SLOVAK_PDT },
    };

    for (const auto& entry : known)
      if (str == entry.name) {
        id = entry.value;
        return true;
      }
    return false;
  }
};
8825
|
|
|
|
|
|
|
|
8826
|
|
|
|
|
|
|
typedef morpho_ids::morpho_id morpho_id; |
8827
|
|
|
|
|
|
|
|
8828
|
|
|
|
|
|
|
} // namespace morphodita |
8829
|
|
|
|
|
|
|
|
8830
|
|
|
|
|
|
|
///////// |
8831
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho.cpp |
8832
|
|
|
|
|
|
|
///////// |
8833
|
|
|
|
|
|
|
|
8834
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8835
|
|
|
|
|
|
|
// |
8836
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8837
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8838
|
|
|
|
|
|
|
// |
8839
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8840
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8841
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8842
|
|
|
|
|
|
|
|
8843
|
|
|
|
|
|
|
namespace morphodita { |
8844
|
|
|
|
|
|
|
|
8845
|
1
|
|
|
|
|
|
// Loads a morphology model from `is`. The first byte selects the model
// kind; the matching implementation then reads the rest of the stream.
// Returns a newly allocated model whose ownership passes to the caller, or
// nullptr when the id byte cannot be read, is unknown, or the model fails
// to load.
morpho* morpho::load(istream& is) {
  // get() yields a negative value on EOF/failure. Check the stream and
  // dispatch on the raw int, so that no out-of-range value is ever
  // converted to the morpho_id enumeration.
  const int raw_id = is.get();
  if (!is) return nullptr;

  switch (raw_id) {
    case morpho_ids::CZECH:
      {
        auto res = new_unique_ptr<czech_morpho>(czech_morpho::morpho_language::CZECH, 1);
        if (res->load(is)) return res.release();
        break;
      }
    case morpho_ids::ENGLISH_V1:
    case morpho_ids::ENGLISH_V2:
    case morpho_ids::ENGLISH_V3:
      {
        auto res = new_unique_ptr<english_morpho>(raw_id == morpho_ids::ENGLISH_V1 ? 1 :
                                                  raw_id == morpho_ids::ENGLISH_V2 ? 2 :
                                                  3);
        if (res->load(is)) return res.release();
        break;
      }
    case morpho_ids::EXTERNAL:
      {
        auto res = new_unique_ptr<external_morpho>(1);
        if (res->load(is)) return res.release();
        break;
      }
    case morpho_ids::GENERIC:
      {
        auto res = new_unique_ptr<generic_morpho>(1);
        if (res->load(is)) return res.release();
        break;
      }
    case morpho_ids::SLOVAK_PDT:
      {
        auto res = new_unique_ptr<czech_morpho>(czech_morpho::morpho_language::SLOVAK, 3);
        if (res->load(is)) return res.release();
        break;
      }
    case morpho_ids::DERIVATOR_DICTIONARY:
      {
        // A derivator is stored first and the actual dictionary follows it
        // in the same stream; the dictionary ends up owning the derivator.
        auto derinet = new_unique_ptr<derivator_dictionary>();
        if (!derinet->load(is)) return nullptr;

        unique_ptr<morpho> dictionary(load(is));
        if (!dictionary) return nullptr;
        derinet->dictionary = dictionary.get();
        dictionary->derinet.reset(derinet.release());
        return dictionary.release();
      }
  }

  return nullptr;
}
8897
|
|
|
|
|
|
|
|
8898
|
0
|
|
|
|
|
|
// Opens the file `fname` in binary mode and loads a model from it.
// Returns nullptr when the file cannot be opened; otherwise returns
// whatever the stream-based load() returns.
morpho* morpho::load(const char* fname) {
  ifstream model_file(path_from_utf8(fname).c_str(), ifstream::binary);
  return model_file ? load(model_file) : nullptr;
}
8904
|
|
|
|
|
|
|
|
8905
|
0
|
|
|
|
|
|
// Returns the pointer held by `derinet` without transferring ownership.
const derivator* morpho::get_derivator() const {
  const derivator* attached = derinet.get();
  return attached;
}
8908
|
|
|
|
|
|
|
|
8909
|
|
|
|
|
|
|
} // namespace morphodita |
8910
|
|
|
|
|
|
|
|
8911
|
|
|
|
|
|
|
///////// |
8912
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser.cpp |
8913
|
|
|
|
|
|
|
///////// |
8914
|
|
|
|
|
|
|
|
8915
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
8916
|
|
|
|
|
|
|
// |
8917
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
8918
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
8919
|
|
|
|
|
|
|
// |
8920
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
8921
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
8922
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
8923
|
|
|
|
|
|
|
|
8924
|
|
|
|
|
|
|
namespace morphodita { |
8925
|
|
|
|
|
|
|
|
8926
|
1
|
|
|
|
|
|
// Reads the guesser from `data`: a two-byte tag count, the tags (each a
// one-byte length followed by that many bytes), the two-byte index of the
// default tag, and finally the rules.
void morpho_statistical_guesser::load(binary_decoder& data) {
  // Tags
  const unsigned tag_count = data.next_2B();
  tags.resize(tag_count);
  for (auto& tag : tags) {
    const unsigned tag_length = data.next_1B();
    tag.resize(tag_length);
    for (auto& tag_char : tag)
      tag_char = data.next_1B();
  }

  // Default tag
  default_tag = data.next_2B();

  // Rules
  rules.load(data);
}
8939
|
|
|
|
|
|
|
|
8940
|
|
|
|
|
|
|
// Helper method for analyze. |
8941
|
0
|
|
|
|
|
|
// Returns true when `used` is non-null and holds an element equal to `rule`.
static bool contains(morpho_statistical_guesser::used_rules* used, const string& rule) {
  if (used)
    for (auto it = used->begin(); it != used->end(); ++it)
      if (*it == rule)
        return true;

  return false;
}
8950
|
|
|
|
|
|
|
|
8951
|
|
|
|
|
|
|
// Produces unique lemma-tag pairs. |
8952
|
0
|
|
|
|
|
|
void morpho_statistical_guesser::analyze(string_piece form, vector& lemmas, morpho_statistical_guesser::used_rules* used) { |
8953
|
|
|
|
|
|
|
unsigned lemmas_initial_size = lemmas.size(); |
8954
|
|
|
|
|
|
|
|
8955
|
|
|
|
|
|
|
// We have rules in format "suffix prefix" in rules. |
8956
|
|
|
|
|
|
|
// Find the matching rule with longest suffix and of those with longest prefix. |
8957
|
0
|
0
|
|
|
|
|
string rule_label; rule_label.reserve(12); |
8958
|
|
|
|
|
|
|
unsigned suffix_len = 0; |
8959
|
0
|
0
|
|
|
|
|
for (; suffix_len < form.len; suffix_len++) { |
8960
|
0
|
0
|
|
|
|
|
rule_label.push_back(form.str[form.len - (suffix_len + 1)]); |
8961
|
0
|
0
|
|
|
|
|
if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); })) |
8962
|
|
|
|
|
|
|
break; |
8963
|
|
|
|
|
|
|
} |
8964
|
|
|
|
|
|
|
|
8965
|
0
|
0
|
|
|
|
|
for (suffix_len++; suffix_len--; ) { |
8966
|
0
|
|
|
|
|
|
rule_label.resize(suffix_len); |
8967
|
0
|
0
|
|
|
|
|
rule_label.push_back(' '); |
8968
|
|
|
|
|
|
|
|
8969
|
|
|
|
|
|
|
const unsigned char* rule = nullptr; |
8970
|
|
|
|
|
|
|
unsigned rule_prefix_len = 0; |
8971
|
0
|
0
|
|
|
|
|
for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) { |
8972
|
0
|
0
|
|
|
|
|
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
|
|
0
|
|
|
|
|
|
8973
|
0
|
|
|
|
|
|
const unsigned char* found = rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); }); |
8974
|
0
|
0
|
|
|
|
|
if (!found) break; |
8975
|
0
|
0
|
|
|
|
|
if (*(found += sizeof(uint16_t))) { |
8976
|
|
|
|
|
|
|
rule = found; |
8977
|
|
|
|
|
|
|
rule_prefix_len = prefix_len; |
8978
|
|
|
|
|
|
|
} |
8979
|
|
|
|
|
|
|
} |
8980
|
|
|
|
|
|
|
|
8981
|
0
|
0
|
|
|
|
|
if (rule) { |
8982
|
0
|
|
|
|
|
|
rule_label.resize(suffix_len + 1 + rule_prefix_len); |
8983
|
0
|
0
|
|
|
|
|
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8984
|
0
|
0
|
|
|
|
|
if (used) used->push_back(rule_label); |
|
|
0
|
|
|
|
|
|
8985
|
0
|
0
|
|
|
|
|
for (int rules_len = *rule++; rules_len; rules_len--) { |
8986
|
0
|
|
|
|
|
|
unsigned pref_del_len = *rule++; const char* pref_del = (const char*)rule; rule += pref_del_len; |
8987
|
0
|
|
|
|
|
|
unsigned pref_add_len = *rule++; const char* pref_add = (const char*)rule; rule += pref_add_len; |
8988
|
0
|
|
|
|
|
|
unsigned suff_del_len = *rule++; const char* suff_del = (const char*)rule; rule += suff_del_len; |
8989
|
0
|
|
|
|
|
|
unsigned suff_add_len = *rule++; const char* suff_add = (const char*)rule; rule += suff_add_len; |
8990
|
0
|
|
|
|
|
|
unsigned tags_len = *rule++; const uint16_t* tags = (const uint16_t*)rule; rule += tags_len * sizeof(uint16_t); |
8991
|
|
|
|
|
|
|
|
8992
|
0
|
0
|
|
|
|
|
if (pref_del_len + suff_del_len > form.len || |
|
|
0
|
|
|
|
|
|
8993
|
0
|
0
|
|
|
|
|
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
|
|
0
|
|
|
|
|
|
8994
|
0
|
0
|
|
|
|
|
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
8995
|
0
|
|
|
|
|
|
(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len == 0)) |
8996
|
0
|
|
|
|
|
|
continue; |
8997
|
|
|
|
|
|
|
|
8998
|
|
|
|
|
|
|
string lemma; |
8999
|
0
|
0
|
|
|
|
|
lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len); |
9000
|
0
|
0
|
|
|
|
|
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
|
|
0
|
|
|
|
|
|
9001
|
0
|
0
|
|
|
|
|
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
|
|
0
|
|
|
|
|
|
9002
|
0
|
0
|
|
|
|
|
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
|
|
0
|
|
|
|
|
|
9003
|
0
|
0
|
|
|
|
|
while (tags_len--) |
9004
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(lemma, this->tags[unaligned_load_inc(tags)]); |
9005
|
|
|
|
|
|
|
} |
9006
|
|
|
|
|
|
|
} |
9007
|
|
|
|
|
|
|
break; |
9008
|
|
|
|
|
|
|
} |
9009
|
|
|
|
|
|
|
} |
9010
|
|
|
|
|
|
|
|
9011
|
|
|
|
|
|
|
// If nothing was found, use default tag. |
9012
|
0
|
0
|
|
|
|
|
if (lemmas.size() == lemmas_initial_size) |
9013
|
0
|
0
|
|
|
|
|
if (!contains(used, string())) { |
9014
|
0
|
0
|
|
|
|
|
if (used) used->push_back(string()); |
9015
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), tags[default_tag]); |
9016
|
|
|
|
|
|
|
} |
9017
|
0
|
|
|
|
|
|
} |
9018
|
|
|
|
|
|
|
|
9019
|
|
|
|
|
|
|
} // namespace morphodita |
9020
|
|
|
|
|
|
|
|
9021
|
|
|
|
|
|
|
///////// |
9022
|
|
|
|
|
|
|
// File: utils/split.h |
9023
|
|
|
|
|
|
|
///////// |
9024
|
|
|
|
|
|
|
|
9025
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
9026
|
|
|
|
|
|
|
// |
9027
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9028
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9029
|
|
|
|
|
|
|
// |
9030
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9031
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9032
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9033
|
|
|
|
|
|
|
|
9034
|
|
|
|
|
|
|
namespace utils { |
9035
|
|
|
|
|
|
|
|
9036
|
|
|
|
|
|
|
// |
9037
|
|
|
|
|
|
|
// Declarations |
9038
|
|
|
|
|
|
|
// |
9039
|
|
|
|
|
|
|
|
9040
|
|
|
|
|
|
|
// Split given text on the separator character. |
9041
|
|
|
|
|
|
|
inline void split(const string& text, char sep, vector& tokens); |
9042
|
|
|
|
|
|
|
inline void split(string_piece text, char sep, vector& tokens); |
9043
|
|
|
|
|
|
|
|
9044
|
|
|
|
|
|
|
// |
9045
|
|
|
|
|
|
|
// Definitions |
9046
|
|
|
|
|
|
|
// |
9047
|
|
|
|
|
|
|
|
9048
|
0
|
|
|
|
|
|
// Splits `text` on every occurrence of `sep` and stores the pieces in
// `tokens`, replacing its previous contents. Empty pieces between adjacent
// separators and after a trailing separator are kept; an empty `text`
// produces no tokens at all.
void split(const string& text, char sep, vector<string>& tokens) {
  tokens.clear();
  if (text.empty()) return;

  string::size_type start = 0;
  string::size_type hit = text.find(sep, start);
  while (hit != string::npos) {
    tokens.emplace_back(text, start, hit - start);
    start = hit + 1;
    hit = text.find(sep, start);
  }

  // Remainder after the last separator (possibly empty).
  tokens.emplace_back(text, start);
}
9058
|
|
|
|
|
|
|
|
9059
|
53
|
|
|
|
|
|
void split(string_piece text, char sep, vector& tokens) { |
9060
|
|
|
|
|
|
|
tokens.clear(); |
9061
|
53
|
50
|
|
|
|
|
if (!text.len) return; |
9062
|
|
|
|
|
|
|
|
9063
|
53
|
|
|
|
|
|
const char* str = text.str; |
9064
|
121
|
100
|
|
|
|
|
for (const char* next; (next = (const char*) memchr(str, sep, text.str + text.len - str)); str = next + 1) |
9065
|
68
|
|
|
|
|
|
tokens.emplace_back(str, next - str); |
9066
|
|
|
|
|
|
|
|
9067
|
53
|
|
|
|
|
|
tokens.emplace_back(str, text.str + text.len - str); |
9068
|
|
|
|
|
|
|
} |
9069
|
|
|
|
|
|
|
|
9070
|
|
|
|
|
|
|
} // namespace utils |
9071
|
|
|
|
|
|
|
|
9072
|
|
|
|
|
|
|
///////// |
9073
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_encoder.cpp |
9074
|
|
|
|
|
|
|
///////// |
9075
|
|
|
|
|
|
|
|
9076
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9077
|
|
|
|
|
|
|
// |
9078
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9079
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9080
|
|
|
|
|
|
|
// |
9081
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9082
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9083
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9084
|
|
|
|
|
|
|
|
9085
|
|
|
|
|
|
|
namespace morphodita { |
9086
|
|
|
|
|
|
|
|
9087
|
0
|
|
|
|
|
|
void morpho_statistical_guesser_encoder::encode(istream& is, binary_encoder& enc) { |
9088
|
|
|
|
|
|
|
unordered_map, vector>>> statistical_guesser; |
9089
|
0
|
|
|
|
|
|
vector tags; |
9090
|
|
|
|
|
|
|
unordered_map tags_map; |
9091
|
|
|
|
|
|
|
|
9092
|
|
|
|
|
|
|
// Load statistical guesser |
9093
|
|
|
|
|
|
|
string line; |
9094
|
0
|
|
|
|
|
|
vector tokens; |
9095
|
0
|
0
|
|
|
|
|
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9096
|
0
|
|
|
|
|
|
int statistical_guesser_default = tags_map.emplace(line.data(), int(tags.size())).first->second; |
9097
|
0
|
0
|
|
|
|
|
if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data()); |
|
|
0
|
|
|
|
|
|
9098
|
|
|
|
|
|
|
|
9099
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
0
|
|
|
|
|
|
9100
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
9101
|
0
|
0
|
|
|
|
|
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9102
|
|
|
|
|
|
|
|
9103
|
0
|
|
|
|
|
|
vector affixes; |
9104
|
0
|
0
|
|
|
|
|
split(tokens[0], ' ', affixes); |
9105
|
0
|
0
|
|
|
|
|
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9106
|
|
|
|
|
|
|
reverse(affixes[1].begin(), affixes[1].end()); |
9107
|
|
|
|
|
|
|
|
9108
|
0
|
0
|
|
|
|
|
auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]]; |
|
|
0
|
|
|
|
|
|
9109
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < tokens.size(); i+= 2) { |
9110
|
0
|
|
|
|
|
|
vector replacements; |
9111
|
0
|
0
|
|
|
|
|
split(tokens[i], ' ', replacements); |
9112
|
0
|
0
|
|
|
|
|
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9113
|
|
|
|
|
|
|
|
9114
|
0
|
|
|
|
|
|
vector rule_tags; |
9115
|
0
|
0
|
|
|
|
|
split(tokens[i+1], ' ', rule_tags); |
9116
|
|
|
|
|
|
|
vector decoded_tags; |
9117
|
0
|
0
|
|
|
|
|
for (auto&& rule_tag : rule_tags) { |
9118
|
0
|
|
|
|
|
|
int tag = tags_map.emplace(rule_tag, int(tags.size())).first->second; |
9119
|
0
|
0
|
|
|
|
|
if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag); |
|
|
0
|
|
|
|
|
|
9120
|
0
|
0
|
|
|
|
|
decoded_tags.emplace_back(tag); |
9121
|
|
|
|
|
|
|
} |
9122
|
|
|
|
|
|
|
|
9123
|
0
|
0
|
|
|
|
|
rules.emplace_back(replacements, decoded_tags); |
9124
|
|
|
|
|
|
|
} |
9125
|
|
|
|
|
|
|
} |
9126
|
|
|
|
|
|
|
|
9127
|
|
|
|
|
|
|
// Encode statistical guesser |
9128
|
0
|
0
|
|
|
|
|
enc.add_2B(tags.size()); |
9129
|
0
|
0
|
|
|
|
|
for (auto&& tag : tags) { |
9130
|
0
|
0
|
|
|
|
|
enc.add_1B(tag.size()); |
9131
|
|
|
|
|
|
|
enc.add_data(tag); |
9132
|
|
|
|
|
|
|
} |
9133
|
0
|
0
|
|
|
|
|
enc.add_2B(statistical_guesser_default); |
9134
|
|
|
|
|
|
|
|
9135
|
0
|
|
|
|
|
|
persistent_unordered_map(statistical_guesser, 5, true, false, [](binary_encoder& enc, vector, vector>> rules) { |
9136
|
0
|
|
|
|
|
|
binary_encoder e; |
9137
|
0
|
0
|
|
|
|
|
e.add_1B(rules.size()); |
9138
|
0
|
0
|
|
|
|
|
for (auto&& rule : rules) { |
9139
|
0
|
0
|
|
|
|
|
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9140
|
0
|
0
|
|
|
|
|
for (auto&& affix : rule.first) { |
9141
|
0
|
0
|
|
|
|
|
e.add_1B(affix.size()); |
9142
|
|
|
|
|
|
|
e.add_data(affix); |
9143
|
|
|
|
|
|
|
} |
9144
|
0
|
0
|
|
|
|
|
e.add_1B(rule.second.size()); |
9145
|
0
|
0
|
|
|
|
|
for (auto&& tag : rule.second) |
9146
|
0
|
0
|
|
|
|
|
e.add_2B(tag); |
9147
|
|
|
|
|
|
|
} |
9148
|
0
|
0
|
|
|
|
|
enc.add_2B(e.data.size()); |
9149
|
|
|
|
|
|
|
enc.add_data(e.data); |
9150
|
0
|
0
|
|
|
|
|
}).save(enc); |
|
|
0
|
|
|
|
|
|
9151
|
0
|
|
|
|
|
|
} |
9152
|
|
|
|
|
|
|
|
9153
|
|
|
|
|
|
|
} // namespace morphodita |
9154
|
|
|
|
|
|
|
|
9155
|
|
|
|
|
|
|
///////// |
9156
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_trainer.h |
9157
|
|
|
|
|
|
|
///////// |
9158
|
|
|
|
|
|
|
|
9159
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9160
|
|
|
|
|
|
|
// |
9161
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9162
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9163
|
|
|
|
|
|
|
// |
9164
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9165
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9166
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9167
|
|
|
|
|
|
|
|
9168
|
|
|
|
|
|
|
namespace morphodita { |
9169
|
|
|
|
|
|
|
|
9170
|
|
|
|
|
|
|
// Static-only trainer of the statistical guesser.
class morpho_statistical_guesser_trainer {
 public:
  // Reads training lines from `is` (tab-separated, three non-empty columns
  // per line) and writes the trained guesser to `os`; the numeric
  // arguments limit the suffix length, the rules kept per suffix and the
  // prefixes considered.
  static void train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os);

 private:
  // One training example together with data derived from it.
  struct instance {
    string form;
    string lemma;
    string tag;
    string lemma_rule;
    string form_prefix;

    instance(const string& form, const string& lemma, const string& tag);
  };

  // Casing classes used when normalizing the case of training forms.
  enum casing { CASE_LC, CASE_UCLC, CASE_UC, CASE_OTHER };

  static casing get_casing(const string& word, bool allow_nonletters);
  static void set_casing(const string& original, casing c, string& word);
  static bool suffix(const string& word, unsigned& length);
};
9187
|
|
|
|
|
|
|
|
9188
|
|
|
|
|
|
|
} // namespace morphodita |
9189
|
|
|
|
|
|
|
|
9190
|
|
|
|
|
|
|
///////// |
9191
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_trainer.cpp |
9192
|
|
|
|
|
|
|
///////// |
9193
|
|
|
|
|
|
|
|
9194
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9195
|
|
|
|
|
|
|
// |
9196
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9197
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9198
|
|
|
|
|
|
|
// |
9199
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9200
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9201
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9202
|
|
|
|
|
|
|
|
9203
|
|
|
|
|
|
|
namespace morphodita { |
9204
|
|
|
|
|
|
|
|
9205
|
0
|
|
|
|
|
|
void morpho_statistical_guesser_trainer::train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os) { |
9206
|
0
|
|
|
|
|
|
vector data; |
9207
|
|
|
|
|
|
|
|
9208
|
|
|
|
|
|
|
// Load training data |
9209
|
|
|
|
|
|
|
string form; |
9210
|
0
|
|
|
|
|
|
vector tokens; |
9211
|
0
|
0
|
|
|
|
|
for (string line; getline(is, line);) { |
|
|
0
|
|
|
|
|
|
9212
|
0
|
0
|
|
|
|
|
if (line.empty()) continue; |
9213
|
|
|
|
|
|
|
|
9214
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
9215
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9216
|
0
|
0
|
|
|
|
|
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9217
|
|
|
|
|
|
|
|
9218
|
|
|
|
|
|
|
// Normalize case |
9219
|
0
|
|
|
|
|
|
casing form_case = get_casing(tokens[0], false); |
9220
|
0
|
|
|
|
|
|
casing lemma_case = get_casing(tokens[1], true); |
9221
|
0
|
0
|
|
|
|
|
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9222
|
0
|
|
|
|
|
|
(lemma_case == CASE_UCLC && form_case == CASE_UC)) { |
9223
|
0
|
0
|
|
|
|
|
set_casing(tokens[0], lemma_case, form); |
9224
|
|
|
|
|
|
|
} else { |
9225
|
0
|
|
|
|
|
|
form.swap(tokens[0]); |
9226
|
|
|
|
|
|
|
} |
9227
|
|
|
|
|
|
|
|
9228
|
0
|
0
|
|
|
|
|
data.emplace_back(form, tokens[1], tokens[2]); |
9229
|
|
|
|
|
|
|
} |
9230
|
|
|
|
|
|
|
|
9231
|
|
|
|
|
|
|
// Generate at most max_prefixes prefixes with min_prefix_count |
9232
|
|
|
|
|
|
|
unordered_map> prefixes_with_forms; |
9233
|
0
|
0
|
|
|
|
|
for (auto&& instance : data) |
9234
|
0
|
0
|
|
|
|
|
if (!instance.form_prefix.empty()) |
9235
|
0
|
|
|
|
|
|
prefixes_with_forms[instance.form_prefix].insert(instance.form); |
9236
|
|
|
|
|
|
|
|
9237
|
0
|
|
|
|
|
|
vector> prefixes_with_counts; |
9238
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes_with_forms) |
9239
|
0
|
0
|
|
|
|
|
if (prefix.second.size() >= min_prefix_count) |
9240
|
0
|
0
|
|
|
|
|
prefixes_with_counts.emplace_back(unsigned(prefix.second.size()), prefix.first); |
9241
|
|
|
|
|
|
|
|
9242
|
0
|
0
|
|
|
|
|
if (prefixes_with_counts.size() > max_prefixes) { |
9243
|
|
|
|
|
|
|
sort(prefixes_with_counts.begin(), prefixes_with_counts.end(), greater>()); |
9244
|
0
|
0
|
|
|
|
|
prefixes_with_counts.resize(max_prefixes); |
9245
|
|
|
|
|
|
|
} |
9246
|
|
|
|
|
|
|
|
9247
|
|
|
|
|
|
|
unordered_set prefixes; |
9248
|
|
|
|
|
|
|
prefixes.emplace(); |
9249
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes_with_counts) |
9250
|
0
|
|
|
|
|
|
prefixes.insert(prefix.second); |
9251
|
|
|
|
|
|
|
|
9252
|
|
|
|
|
|
|
// Generate the guesser rules |
9253
|
|
|
|
|
|
|
unordered_map> tags; |
9254
|
|
|
|
|
|
|
unordered_map>> rules; |
9255
|
|
|
|
|
|
|
unordered_set suffixes; |
9256
|
|
|
|
|
|
|
string prefix_suffix, tag_lemma_rule; |
9257
|
0
|
0
|
|
|
|
|
for (auto&& instance : data) { |
9258
|
|
|
|
|
|
|
// Add tag |
9259
|
0
|
|
|
|
|
|
tags[instance.tag].insert(instance.form); |
9260
|
|
|
|
|
|
|
|
9261
|
|
|
|
|
|
|
// Find longest matching prefix |
9262
|
|
|
|
|
|
|
unsigned prefix_length = 0; |
9263
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes) |
9264
|
0
|
0
|
|
|
|
|
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9265
|
0
|
|
|
|
|
|
prefix_length = prefix.size(); |
9266
|
|
|
|
|
|
|
|
9267
|
0
|
0
|
|
|
|
|
tag_lemma_rule.assign(instance.lemma_rule).append("\t").append(instance.tag); |
9268
|
|
|
|
|
|
|
|
9269
|
|
|
|
|
|
|
// Add prefix + all suffixes of length 1..suffix_len to rules |
9270
|
0
|
0
|
|
|
|
|
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9271
|
0
|
0
|
|
|
|
|
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9272
|
|
|
|
|
|
|
rules[prefix_suffix][tag_lemma_rule].insert(instance.form); |
9273
|
0
|
|
|
|
|
|
suffixes.emplace(instance.form, instance.form.size() - utf8_length, utf8_length); |
9274
|
|
|
|
|
|
|
} |
9275
|
|
|
|
|
|
|
} |
9276
|
|
|
|
|
|
|
|
9277
|
|
|
|
|
|
|
// Start generating the guesser description by writing the most "frequent" tag |
9278
|
|
|
|
|
|
|
string most_frequent_tag; unsigned most_frequent_tag_count = 0; |
9279
|
0
|
0
|
|
|
|
|
for (auto&& tag : tags) |
9280
|
0
|
0
|
|
|
|
|
if (tag.second.size() > most_frequent_tag_count) |
9281
|
0
|
|
|
|
|
|
most_frequent_tag.assign(tag.first), most_frequent_tag_count = tag.second.size(); |
9282
|
|
|
|
|
|
|
|
9283
|
|
|
|
|
|
|
os << most_frequent_tag << endl; |
9284
|
|
|
|
|
|
|
|
9285
|
|
|
|
|
|
|
// For every prefix-suffix, write at most rules_per_suffix most "frequent" rules |
9286
|
|
|
|
|
|
|
string rule_key, output; |
9287
|
|
|
|
|
|
|
unordered_set rules_set; |
9288
|
0
|
|
|
|
|
|
vector> rules_counts; |
9289
|
0
|
0
|
|
|
|
|
for (auto&& suffix : suffixes) { |
9290
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes) { |
9291
|
0
|
|
|
|
|
|
rules_counts.clear(); |
9292
|
|
|
|
|
|
|
rules_set.clear(); |
9293
|
|
|
|
|
|
|
|
9294
|
|
|
|
|
|
|
// Gather at most rules_per_suffix rules |
9295
|
0
|
0
|
|
|
|
|
for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) { |
|
|
0
|
|
|
|
|
|
9296
|
0
|
0
|
|
|
|
|
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9297
|
0
|
0
|
|
|
|
|
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9298
|
0
|
0
|
|
|
|
|
if (!rules.count(rule_key)) continue; |
9299
|
|
|
|
|
|
|
|
9300
|
|
|
|
|
|
|
unsigned rules_counts_original = rules_counts.size(); |
9301
|
0
|
0
|
|
|
|
|
for (auto&& entry : rules[rule_key]) |
9302
|
0
|
0
|
|
|
|
|
if (!rules_set.count(entry.first)) { |
9303
|
0
|
0
|
|
|
|
|
rules_counts.emplace_back(unsigned(entry.second.size()), entry.first); |
9304
|
|
|
|
|
|
|
rules_set.insert(entry.first); |
9305
|
|
|
|
|
|
|
} |
9306
|
|
|
|
|
|
|
|
9307
|
|
|
|
|
|
|
sort(rules_counts.begin() + rules_counts_original, rules_counts.end(), greater>()); |
9308
|
|
|
|
|
|
|
|
9309
|
0
|
0
|
|
|
|
|
if (rules_counts.size() >= rules_per_suffix) { |
9310
|
0
|
0
|
|
|
|
|
rules_counts.resize(rules_per_suffix); |
9311
|
|
|
|
|
|
|
break; |
9312
|
|
|
|
|
|
|
} |
9313
|
|
|
|
|
|
|
} |
9314
|
|
|
|
|
|
|
// Stop if there are no rules for given prefix |
9315
|
0
|
0
|
|
|
|
|
if (rules_set.empty()) break; |
9316
|
|
|
|
|
|
|
} |
9317
|
0
|
0
|
|
|
|
|
if (!rules_set.empty()) { |
9318
|
|
|
|
|
|
|
// Write the chosen rules |
9319
|
0
|
0
|
|
|
|
|
output.assign(prefix).append(" ").append(suffix); |
9320
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < rules_counts.size(); i++) { |
9321
|
0
|
|
|
|
|
|
unsigned tab = rules_counts[i].second.find('\t'); |
9322
|
|
|
|
|
|
|
|
9323
|
0
|
0
|
|
|
|
|
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9324
|
|
|
|
|
|
|
|
9325
|
|
|
|
|
|
|
// Join rules with same lemma_rule |
9326
|
0
|
0
|
|
|
|
|
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9327
|
0
|
0
|
|
|
|
|
output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos); |
|
|
0
|
|
|
|
|
|
9328
|
|
|
|
|
|
|
} |
9329
|
|
|
|
|
|
|
os << output << endl; |
9330
|
|
|
|
|
|
|
} |
9331
|
|
|
|
|
|
|
} |
9332
|
|
|
|
|
|
|
} |
9333
|
0
|
|
|
|
|
|
} |
9334
|
|
|
|
|
|
|
|
9335
|
0
|
|
|
|
|
|
morpho_statistical_guesser_trainer::instance::instance(const string& form, const string& lemma, const string& tag) |
9336
|
0
|
|
|
|
|
|
: form(form), lemma(lemma), tag(tag) |
9337
|
|
|
|
|
|
|
{ |
9338
|
|
|
|
|
|
|
using namespace unilib; |
9339
|
|
|
|
|
|
|
|
9340
|
|
|
|
|
|
|
unsigned length_best = 0; |
9341
|
|
|
|
|
|
|
int form_best = 0, lemma_best = 0; |
9342
|
0
|
0
|
|
|
|
|
for (int offset = -int(lemma.size() - 1); offset < int(form.size()) - 1; offset++) { |
9343
|
0
|
|
|
|
|
|
unsigned form_offset = max(0, offset); |
9344
|
0
|
|
|
|
|
|
unsigned lemma_offset = max(0, -offset); |
9345
|
0
|
0
|
|
|
|
|
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9346
|
0
|
0
|
|
|
|
|
if (form[form_offset] == lemma[lemma_offset]) { |
9347
|
0
|
0
|
|
|
|
|
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9348
|
0
|
|
|
|
|
|
length_best = length, form_best = form_offset + 1 - length, lemma_best = lemma_offset + 1 - length; |
9349
|
|
|
|
|
|
|
} else { |
9350
|
|
|
|
|
|
|
length = 0; |
9351
|
|
|
|
|
|
|
} |
9352
|
|
|
|
|
|
|
} |
9353
|
|
|
|
|
|
|
|
9354
|
0
|
0
|
|
|
|
|
form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0); |
|
|
0
|
|
|
|
|
|
9355
|
0
|
0
|
|
|
|
|
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9356
|
0
|
0
|
|
|
|
|
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9357
|
0
|
|
|
|
|
|
} |
9358
|
|
|
|
|
|
|
|
9359
|
0
|
|
|
|
|
|
morpho_statistical_guesser_trainer::casing morpho_statistical_guesser_trainer::get_casing(const string& word, bool allow_nonletters) { |
9360
|
|
|
|
|
|
|
using namespace unilib; |
9361
|
|
|
|
|
|
|
|
9362
|
|
|
|
|
|
|
casing c = CASE_OTHER; |
9363
|
|
|
|
|
|
|
int index = 0; |
9364
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) { |
9365
|
0
|
|
|
|
|
|
auto cat = unicode::category(chr); |
9366
|
|
|
|
|
|
|
|
9367
|
|
|
|
|
|
|
// Return OTHER for non-letters |
9368
|
0
|
0
|
|
|
|
|
if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue; |
|
|
0
|
|
|
|
|
|
9369
|
0
|
0
|
|
|
|
|
if (cat & ~unicode::L) return CASE_OTHER; |
9370
|
|
|
|
|
|
|
|
9371
|
0
|
0
|
|
|
|
|
if (index == 0) { |
9372
|
0
|
0
|
|
|
|
|
c = cat & unicode::Ll ? CASE_LC : CASE_UC; |
9373
|
0
|
0
|
|
|
|
|
} else if (c == CASE_UC && index == 1) { |
9374
|
0
|
0
|
|
|
|
|
c = cat & unicode::Ll ? CASE_UCLC : CASE_UC; |
9375
|
0
|
0
|
|
|
|
|
} else if (c == CASE_UC) { |
9376
|
0
|
0
|
|
|
|
|
if (cat & ~unicode::Lut) return CASE_OTHER; |
9377
|
|
|
|
|
|
|
} else /*CASE_LC or CASE_UCLC*/ { |
9378
|
0
|
0
|
|
|
|
|
if (cat & ~unicode::Ll) return CASE_OTHER; |
9379
|
|
|
|
|
|
|
} |
9380
|
0
|
|
|
|
|
|
index++; |
9381
|
|
|
|
|
|
|
} |
9382
|
0
|
|
|
|
|
|
return c; |
9383
|
|
|
|
|
|
|
} |
9384
|
|
|
|
|
|
|
|
9385
|
0
|
|
|
|
|
|
void morpho_statistical_guesser_trainer::set_casing(const string& original, casing c, string& word) { |
9386
|
|
|
|
|
|
|
using namespace unilib; |
9387
|
|
|
|
|
|
|
|
9388
|
|
|
|
|
|
|
word.clear(); |
9389
|
|
|
|
|
|
|
bool first = true; |
9390
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(original)) { |
9391
|
0
|
0
|
|
|
|
|
utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr)); |
|
|
0
|
|
|
|
|
|
9392
|
|
|
|
|
|
|
first = false; |
9393
|
|
|
|
|
|
|
} |
9394
|
0
|
|
|
|
|
|
} |
9395
|
|
|
|
|
|
|
|
9396
|
0
|
|
|
|
|
|
bool morpho_statistical_guesser_trainer::suffix(const string& word, unsigned& length) { |
9397
|
|
|
|
|
|
|
using namespace unilib; |
9398
|
|
|
|
|
|
|
|
9399
|
|
|
|
|
|
|
unsigned additional = 1; |
9400
|
0
|
0
|
|
|
|
|
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9401
|
0
|
|
|
|
|
|
additional++; |
9402
|
|
|
|
|
|
|
|
9403
|
0
|
0
|
|
|
|
|
if (additional + length > word.size()) return false; |
9404
|
|
|
|
|
|
|
|
9405
|
0
|
|
|
|
|
|
length += additional; |
9406
|
0
|
|
|
|
|
|
return true; |
9407
|
|
|
|
|
|
|
} |
9408
|
|
|
|
|
|
|
|
9409
|
|
|
|
|
|
|
} // namespace morphodita |
9410
|
|
|
|
|
|
|
|
9411
|
|
|
|
|
|
|
///////// |
9412
|
|
|
|
|
|
|
// File: morphodita/morpho/raw_morpho_dictionary_reader.cpp |
9413
|
|
|
|
|
|
|
///////// |
9414
|
|
|
|
|
|
|
|
9415
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9416
|
|
|
|
|
|
|
// |
9417
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9418
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9419
|
|
|
|
|
|
|
// |
9420
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9421
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9422
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9423
|
|
|
|
|
|
|
|
9424
|
|
|
|
|
|
|
namespace morphodita { |
9425
|
|
|
|
|
|
|
|
9426
|
0
|
|
|
|
|
|
bool raw_morpho_dictionary_reader::next_lemma(string& lemma, vector>& tagged_forms) { |
9427
|
0
|
0
|
|
|
|
|
if (line.empty()) { |
9428
|
0
|
0
|
|
|
|
|
if (!getline(in, line)) |
9429
|
|
|
|
|
|
|
return false; |
9430
|
0
|
|
|
|
|
|
split(line, '\t', tokens); |
9431
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9432
|
|
|
|
|
|
|
} |
9433
|
|
|
|
|
|
|
|
9434
|
|
|
|
|
|
|
lemma = tokens[0]; |
9435
|
0
|
0
|
|
|
|
|
if (seen_lemmas.count(lemma)) |
9436
|
0
|
0
|
|
|
|
|
training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!"); |
|
|
0
|
|
|
|
|
|
9437
|
|
|
|
|
|
|
seen_lemmas.insert(lemma); |
9438
|
|
|
|
|
|
|
|
9439
|
|
|
|
|
|
|
tagged_forms.clear(); |
9440
|
0
|
|
|
|
|
|
tagged_forms.emplace_back(tokens[2], tokens[1]); |
9441
|
0
|
0
|
|
|
|
|
while (getline(in, line)) { |
9442
|
0
|
|
|
|
|
|
split(line, '\t', tokens); |
9443
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9444
|
|
|
|
|
|
|
|
9445
|
0
|
0
|
|
|
|
|
if (lemma != tokens[0]) break; |
9446
|
0
|
|
|
|
|
|
tagged_forms.emplace_back(tokens[2], tokens[1]); |
9447
|
|
|
|
|
|
|
} |
9448
|
|
|
|
|
|
|
|
9449
|
|
|
|
|
|
|
return true; |
9450
|
|
|
|
|
|
|
} |
9451
|
|
|
|
|
|
|
|
9452
|
|
|
|
|
|
|
} // namespace morphodita |
9453
|
|
|
|
|
|
|
|
9454
|
|
|
|
|
|
|
///////// |
9455
|
|
|
|
|
|
|
// File: morphodita/morpho/tag_filter.cpp |
9456
|
|
|
|
|
|
|
///////// |
9457
|
|
|
|
|
|
|
|
9458
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9459
|
|
|
|
|
|
|
// |
9460
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9461
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9462
|
|
|
|
|
|
|
// |
9463
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9464
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9465
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9466
|
|
|
|
|
|
|
|
9467
|
|
|
|
|
|
|
namespace morphodita { |
9468
|
|
|
|
|
|
|
|
9469
|
0
|
|
|
|
|
|
tag_filter::tag_filter(const char* filter) { |
9470
|
0
|
0
|
|
|
|
|
if (!filter) return; |
9471
|
|
|
|
|
|
|
|
9472
|
0
|
0
|
|
|
|
|
wildcard.assign(filter); |
9473
|
|
|
|
|
|
|
filter = wildcard.c_str(); |
9474
|
|
|
|
|
|
|
|
9475
|
0
|
0
|
|
|
|
|
for (int tag_pos = 0, filter_pos = 0; filter[filter_pos]; tag_pos++, filter_pos++) { |
9476
|
0
|
0
|
|
|
|
|
if (filter[filter_pos] == '?') continue; |
9477
|
0
|
0
|
|
|
|
|
if (filter[filter_pos] == '[') { |
9478
|
0
|
|
|
|
|
|
filter_pos++; |
9479
|
|
|
|
|
|
|
|
9480
|
0
|
|
|
|
|
|
bool negate = false; |
9481
|
0
|
0
|
|
|
|
|
if (filter[filter_pos] == '^') negate = true, filter_pos++; |
9482
|
|
|
|
|
|
|
|
9483
|
0
|
|
|
|
|
|
int chars_start = filter_pos; |
9484
|
0
|
0
|
|
|
|
|
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9485
|
0
|
|
|
|
|
|
filter_pos++; |
9486
|
|
|
|
|
|
|
|
9487
|
0
|
0
|
|
|
|
|
filters.emplace_back(tag_pos, negate, chars_start, filter_pos - chars_start); |
9488
|
0
|
0
|
|
|
|
|
if (!filter[filter_pos]) break; |
9489
|
|
|
|
|
|
|
} else { |
9490
|
0
|
0
|
|
|
|
|
filters.emplace_back(tag_pos, false, filter_pos, 1); |
9491
|
|
|
|
|
|
|
} |
9492
|
|
|
|
|
|
|
} |
9493
|
|
|
|
|
|
|
} |
9494
|
|
|
|
|
|
|
|
9495
|
|
|
|
|
|
|
} // namespace morphodita |
9496
|
|
|
|
|
|
|
|
9497
|
|
|
|
|
|
|
///////// |
9498
|
|
|
|
|
|
|
// File: morphodita/tagger/elementary_features.h |
9499
|
|
|
|
|
|
|
///////// |
9500
|
|
|
|
|
|
|
|
9501
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9502
|
|
|
|
|
|
|
// |
9503
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9504
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9505
|
|
|
|
|
|
|
// |
9506
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9507
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9508
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9509
|
|
|
|
|
|
|
|
9510
|
|
|
|
|
|
|
namespace morphodita { |
9511
|
|
|
|
|
|
|
|
9512
|
|
|
|
|
|
|
// Declarations |
9513
|
|
|
|
|
|
|
enum elementary_feature_type { PER_FORM, PER_TAG, DYNAMIC }; |
9514
|
|
|
|
|
|
|
enum elementary_feature_range { ONLY_CURRENT, ANY_OFFSET }; |
9515
|
|
|
|
|
|
|
|
9516
|
|
|
|
|
|
|
typedef uint32_t elementary_feature_value; |
9517
|
|
|
|
|
|
|
enum :elementary_feature_value { elementary_feature_unknown = 0, elementary_feature_empty = 1 }; |
9518
|
|
|
|
|
|
|
|
9519
|
136
|
|
|
|
|
|
struct elementary_feature_description { |
9520
|
|
|
|
|
|
|
string name; |
9521
|
|
|
|
|
|
|
elementary_feature_type type; |
9522
|
|
|
|
|
|
|
elementary_feature_range range; |
9523
|
|
|
|
|
|
|
int index; |
9524
|
|
|
|
|
|
|
int map_index; |
9525
|
|
|
|
|
|
|
}; |
9526
|
|
|
|
|
|
|
|
9527
|
|
|
|
|
|
|
template |
9528
|
1
|
|
|
|
|
|
class elementary_features { |
9529
|
|
|
|
|
|
|
public: |
9530
|
|
|
|
|
|
|
bool load(istream& is); |
9531
|
|
|
|
|
|
|
bool save(ostream& out); |
9532
|
|
|
|
|
|
|
|
9533
|
|
|
|
|
|
|
vector |
9534
|
|
|
|
|
|
|
}; |
9535
|
|
|
|
|
|
|
|
9536
|
0
|
|
|
|
|
|
class persistent_elementary_feature_map : public persistent_unordered_map { |
9537
|
|
|
|
|
|
|
public: |
9538
|
|
|
|
|
|
|
persistent_elementary_feature_map() : persistent_unordered_map() {} |
9539
|
|
|
|
|
|
|
persistent_elementary_feature_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {} |
9540
|
|
|
|
|
|
|
|
9541
|
|
|
|
|
|
|
elementary_feature_value value(const char* feature, int len) const { |
9542
|
92
|
|
|
|
|
|
auto* it = at_typed(feature, len); |
9543
|
92
|
0
|
|
|
|
|
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
9544
|
|
|
|
|
|
|
} |
9545
|
|
|
|
|
|
|
}; |
9546
|
|
|
|
|
|
|
|
9547
|
|
|
|
|
|
|
// Definitions |
9548
|
|
|
|
|
|
|
template |
9549
|
1
|
|
|
|
|
|
inline bool elementary_features |
9550
|
|
|
|
|
|
|
binary_decoder data; |
9551
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
50
|
|
|
|
|
|
9552
|
|
|
|
|
|
|
|
9553
|
|
|
|
|
|
|
try { |
9554
|
1
|
50
|
|
|
|
|
maps.resize(data.next_1B()); |
|
|
50
|
|
|
|
|
|
9555
|
28
|
100
|
|
|
|
|
for (auto&& map : maps) |
9556
|
27
|
50
|
|
|
|
|
map.load(data); |
|
|
0
|
|
|
|
|
|
9557
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
9558
|
|
|
|
|
|
|
return false; |
9559
|
|
|
|
|
|
|
} |
9560
|
|
|
|
|
|
|
|
9561
|
1
|
|
|
|
|
|
return data.is_end(); |
9562
|
|
|
|
|
|
|
} |
9563
|
|
|
|
|
|
|
|
9564
|
|
|
|
|
|
|
} // namespace morphodita |
9565
|
|
|
|
|
|
|
|
9566
|
|
|
|
|
|
|
///////// |
9567
|
|
|
|
|
|
|
// File: morphodita/tagger/vli.h |
9568
|
|
|
|
|
|
|
///////// |
9569
|
|
|
|
|
|
|
|
9570
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9571
|
|
|
|
|
|
|
// |
9572
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9573
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9574
|
|
|
|
|
|
|
// |
9575
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9576
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9577
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9578
|
|
|
|
|
|
|
|
9579
|
|
|
|
|
|
|
namespace morphodita { |
9580
|
|
|
|
|
|
|
|
9581
|
|
|
|
|
|
|
// Declarations |
9582
|
|
|
|
|
|
|
template |
9583
|
|
|
|
|
|
|
class vli { |
9584
|
|
|
|
|
|
|
public: |
9585
|
|
|
|
|
|
|
static int max_length(); |
9586
|
|
|
|
|
|
|
static void encode(T value, char*& where); |
9587
|
|
|
|
|
|
|
static T decode(const char*& from); |
9588
|
|
|
|
|
|
|
}; |
9589
|
|
|
|
|
|
|
|
9590
|
|
|
|
|
|
|
// Definitions |
9591
|
|
|
|
|
|
|
template <> |
9592
|
|
|
|
|
|
|
inline int vli::max_length() { |
9593
|
|
|
|
|
|
|
return 5; |
9594
|
|
|
|
|
|
|
} |
9595
|
|
|
|
|
|
|
|
9596
|
|
|
|
|
|
|
template <> |
9597
|
1171
|
|
|
|
|
|
inline void vli::encode(uint32_t value, char*& where) { |
9598
|
1171
|
50
|
|
|
|
|
if (value < 0x80) *where++ = value; |
9599
|
0
|
0
|
|
|
|
|
else if (value < 0x4000) *where++ = (value >> 7) | 0x80u, *where++ = value & 0x7Fu; |
9600
|
0
|
0
|
|
|
|
|
else if (value < 0x200000) *where++ = (value >> 14) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
9601
|
0
|
0
|
|
|
|
|
else if (value < 0x10000000) *where++ = (value >> 21) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
9602
|
0
|
|
|
|
|
|
else *where++ = (value >> 28) | 0x80u, *where++ = ((value >> 21) & 0x7Fu) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
9603
|
1171
|
|
|
|
|
|
} |
9604
|
|
|
|
|
|
|
|
9605
|
|
|
|
|
|
|
template <> |
9606
|
|
|
|
|
|
|
inline uint32_t vli::decode(const char*& from) { |
9607
|
|
|
|
|
|
|
uint32_t value = 0; |
9608
|
0
|
0
|
|
|
|
|
while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u); |
|
|
0
|
|
|
|
|
|
9609
|
0
|
|
|
|
|
|
value = (value << 7) | ((unsigned char)(*from++)); |
9610
|
|
|
|
|
|
|
return value; |
9611
|
|
|
|
|
|
|
} |
9612
|
|
|
|
|
|
|
|
9613
|
|
|
|
|
|
|
} // namespace morphodita |
9614
|
|
|
|
|
|
|
|
9615
|
|
|
|
|
|
|
///////// |
9616
|
|
|
|
|
|
|
// File: morphodita/tagger/feature_sequences.h |
9617
|
|
|
|
|
|
|
///////// |
9618
|
|
|
|
|
|
|
|
9619
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9620
|
|
|
|
|
|
|
// |
9621
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9622
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9623
|
|
|
|
|
|
|
// |
9624
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9625
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9626
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9627
|
|
|
|
|
|
|
|
9628
|
|
|
|
|
|
|
namespace morphodita { |
9629
|
|
|
|
|
|
|
|
9630
|
|
|
|
|
|
|
// Declarations |
9631
|
|
|
|
|
|
|
typedef int32_t feature_sequence_score; |
9632
|
|
|
|
|
|
|
typedef int64_t feature_sequences_score; |
9633
|
|
|
|
|
|
|
|
9634
|
|
|
|
|
|
|
struct feature_sequence_element { |
9635
|
|
|
|
|
|
|
elementary_feature_type type; |
9636
|
|
|
|
|
|
|
int elementary_index; |
9637
|
|
|
|
|
|
|
int sequence_index; |
9638
|
|
|
|
|
|
|
|
9639
|
|
|
|
|
|
|
feature_sequence_element() {} |
9640
|
0
|
|
|
|
|
|
feature_sequence_element(elementary_feature_type type, int elementary_index, int sequence_index) : type(type), elementary_index(elementary_index), sequence_index(sequence_index) {} |
9641
|
|
|
|
|
|
|
}; |
9642
|
|
|
|
|
|
|
|
9643
|
74
|
0
|
|
|
|
|
struct feature_sequence { |
|
|
0
|
|
|
|
|
|
9644
|
|
|
|
|
|
|
vector elements; |
9645
|
|
|
|
|
|
|
int dependant_range = 1; |
9646
|
|
|
|
|
|
|
}; |
9647
|
|
|
|
|
|
|
|
9648
|
|
|
|
|
|
|
template |
9649
|
3
|
0
|
|
|
|
|
class feature_sequences { |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9650
|
|
|
|
|
|
|
public: |
9651
|
|
|
|
|
|
|
typedef typename ElementaryFeatures::per_form_features per_form_features; |
9652
|
|
|
|
|
|
|
typedef typename ElementaryFeatures::per_tag_features per_tag_features; |
9653
|
|
|
|
|
|
|
typedef typename ElementaryFeatures::dynamic_features dynamic_features; |
9654
|
|
|
|
|
|
|
|
9655
|
|
|
|
|
|
|
void parse(int window_size, istream& is); |
9656
|
|
|
|
|
|
|
bool load(istream& is); |
9657
|
|
|
|
|
|
|
bool save(ostream& os); |
9658
|
|
|
|
|
|
|
|
9659
|
|
|
|
|
|
|
struct cache; |
9660
|
|
|
|
|
|
|
|
9661
|
|
|
|
|
|
|
inline void initialize_sentence(const vector& forms, const vector>& analyses, cache& c) const; |
9662
|
|
|
|
|
|
|
inline void compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const; |
9663
|
|
|
|
|
|
|
inline feature_sequences_score score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const; |
9664
|
|
|
|
|
|
|
void feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector& keys, cache& c) const; |
9665
|
|
|
|
|
|
|
|
9666
|
|
|
|
|
|
|
ElementaryFeatures elementary; |
9667
|
|
|
|
|
|
|
vector |
9668
|
|
|
|
|
|
|
vector sequences; |
9669
|
|
|
|
|
|
|
}; |
9670
|
|
|
|
|
|
|
|
9671
|
0
|
|
|
|
|
|
class persistent_feature_sequence_map : public persistent_unordered_map { |
9672
|
|
|
|
|
|
|
public: |
9673
|
|
|
|
|
|
|
persistent_feature_sequence_map() : persistent_unordered_map() {} |
9674
|
|
|
|
|
|
|
persistent_feature_sequence_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {} |
9675
|
|
|
|
|
|
|
|
9676
|
|
|
|
|
|
|
feature_sequence_score score(const char* feature, int len) const { |
9677
|
346
|
|
|
|
|
|
auto* it = at_typed(feature, len); |
9678
|
346
|
0
|
|
|
|
|
return it ? unaligned_load(it) : 0; |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
9679
|
|
|
|
|
|
|
} |
9680
|
|
|
|
|
|
|
}; |
9681
|
|
|
|
|
|
|
|
9682
|
|
|
|
|
|
|
template using persistent_feature_sequences = feature_sequences; |
9683
|
|
|
|
|
|
|
|
9684
|
|
|
|
|
|
|
// Definitions |
9685
|
|
|
|
|
|
|
template |
9686
|
1
|
|
|
|
|
|
inline bool feature_sequences::load(istream& is) { |
9687
|
1
|
50
|
|
|
|
|
if (!elementary.load(is)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9688
|
|
|
|
|
|
|
|
9689
|
|
|
|
|
|
|
binary_decoder data; |
9690
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9691
|
|
|
|
|
|
|
|
9692
|
|
|
|
|
|
|
try { |
9693
|
1
|
50
|
|
|
|
|
sequences.resize(data.next_1B()); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9694
|
75
|
100
|
|
|
|
|
for (auto&& sequence : sequences) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9695
|
74
|
50
|
|
|
|
|
sequence.dependant_range = data.next_4B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9696
|
74
|
50
|
|
|
|
|
sequence.elements.resize(data.next_1B()); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9697
|
228
|
100
|
|
|
|
|
for (auto&& element : sequence.elements) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9698
|
154
|
50
|
|
|
|
|
element.type = elementary_feature_type(data.next_4B()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9699
|
154
|
50
|
|
|
|
|
element.elementary_index = data.next_4B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9700
|
154
|
50
|
|
|
|
|
element.sequence_index = data.next_4B(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9701
|
|
|
|
|
|
|
} |
9702
|
|
|
|
|
|
|
} |
9703
|
|
|
|
|
|
|
|
9704
|
1
|
50
|
|
|
|
|
scores.resize(data.next_1B()); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9705
|
75
|
100
|
|
|
|
|
for (auto&& score : scores) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9706
|
74
|
50
|
|
|
|
|
score.load(data); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9707
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
9708
|
|
|
|
|
|
|
return false; |
9709
|
|
|
|
|
|
|
} |
9710
|
|
|
|
|
|
|
|
9711
|
1
|
|
|
|
|
|
return data.is_end(); |
9712
|
|
|
|
|
|
|
} |
9713
|
|
|
|
|
|
|
|
9714
|
|
|
|
|
|
|
template |
9715
|
2
|
|
|
|
|
|
struct feature_sequences::cache { |
9716
|
|
|
|
|
|
|
const vector* forms; |
9717
|
|
|
|
|
|
|
const vector>* analyses; |
9718
|
|
|
|
|
|
|
vector elementary_per_form; |
9719
|
|
|
|
|
|
|
vector> elementary_per_tag; |
9720
|
|
|
|
|
|
|
|
9721
|
0
|
|
|
|
|
|
struct cache_element { |
9722
|
|
|
|
|
|
|
vector key; |
9723
|
|
|
|
|
|
|
int key_size; |
9724
|
|
|
|
|
|
|
feature_sequence_score score; |
9725
|
|
|
|
|
|
|
|
9726
|
74
|
0
|
|
|
|
|
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9727
|
|
|
|
|
|
|
}; |
9728
|
|
|
|
|
|
|
vector caches; |
9729
|
|
|
|
|
|
|
vector window; |
9730
|
|
|
|
|
|
|
vector key; |
9731
|
|
|
|
|
|
|
feature_sequences_score score; |
9732
|
|
|
|
|
|
|
|
9733
|
1
|
|
|
|
|
|
cache(const feature_sequences& self) : score(0) { |
9734
|
1
|
0
|
|
|
|
|
caches.reserve(self.sequences.size()); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9735
|
|
|
|
|
|
|
int max_sequence_elements = 0, max_window_size = 1; |
9736
|
75
|
0
|
|
|
|
|
for (auto&& sequence : self.sequences) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9737
|
74
|
0
|
|
|
|
|
caches.emplace_back(int(sequence.elements.size())); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9738
|
74
|
0
|
|
|
|
|
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9739
|
228
|
0
|
|
|
|
|
for (auto&& element : sequence.elements) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9740
|
154
|
0
|
|
|
|
|
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9741
|
|
|
|
|
|
|
max_window_size = 1 - element.sequence_index; |
9742
|
|
|
|
|
|
|
} |
9743
|
1
|
0
|
|
|
|
|
key.resize(max_sequence_elements * vli::max_length()); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9744
|
1
|
0
|
|
|
|
|
window.resize(max_window_size); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9745
|
1
|
|
|
|
|
|
} |
9746
|
|
|
|
|
|
|
}; |
9747
|
|
|
|
|
|
|
|
9748
|
|
|
|
|
|
|
template |
9749
|
1
|
|
|
|
|
|
void feature_sequences::initialize_sentence(const vector& forms, const vector>& analyses, cache& c) const { |
9750
|
|
|
|
|
|
|
// Store forms and forms_size |
9751
|
1
|
|
|
|
|
|
c.forms = &forms; |
9752
|
1
|
|
|
|
|
|
c.analyses = &analyses; |
9753
|
|
|
|
|
|
|
|
9754
|
|
|
|
|
|
|
// Enlarge elementary features vectors if needed |
9755
|
1
|
0
|
|
|
|
|
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9756
|
1
|
0
|
|
|
|
|
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9757
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9758
|
7
|
0
|
|
|
|
|
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9759
|
7
|
|
|
|
|
|
c.elementary_per_tag[i].resize(analyses[i].size() * 2); |
9760
|
|
|
|
|
|
|
|
9761
|
|
|
|
|
|
|
// Compute elementary features |
9762
|
1
|
|
|
|
|
|
elementary.compute_features(forms, analyses, c.elementary_per_form, c.elementary_per_tag); |
9763
|
|
|
|
|
|
|
|
9764
|
|
|
|
|
|
|
// Clear score cache, because scores may have been modified |
9765
|
1
|
|
|
|
|
|
c.score = 0; |
9766
|
75
|
0
|
|
|
|
|
for (auto&& cache : c.caches) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9767
|
74
|
|
|
|
|
|
cache.key_size = cache.score = 0; |
9768
|
1
|
|
|
|
|
|
} |
9769
|
|
|
|
|
|
|
|
9770
|
|
|
|
|
|
|
template |
9771
|
30
|
|
|
|
|
|
void feature_sequences::compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const { |
9772
|
15
|
0
|
|
|
|
|
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9773
|
15
|
|
|
|
|
|
} |
9774
|
|
|
|
|
|
|
|
9775
|
|
|
|
|
|
|
template |
9776
|
26
|
|
|
|
|
|
feature_sequences_score feature_sequences::score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const { |
9777
|
|
|
|
|
|
|
// Start by creating a window of per_tag_features* |
9778
|
43
|
0
|
|
|
|
|
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9779
|
90
|
|
|
|
|
|
c.window[i] = &c.elementary_per_tag[form_index - i][tags_window[i]]; |
9780
|
|
|
|
|
|
|
|
9781
|
|
|
|
|
|
|
// Compute the score |
9782
|
13
|
|
|
|
|
|
feature_sequences_score result = c.score; |
9783
|
671
|
0
|
|
|
|
|
for (unsigned i = 0; i < sequences.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9784
|
658
|
0
|
|
|
|
|
if (tags_unchanged >= sequences[i].dependant_range) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9785
|
|
|
|
|
|
|
break; |
9786
|
|
|
|
|
|
|
|
9787
|
653
|
|
|
|
|
|
char* key = c.key.data(); |
9788
|
1824
|
0
|
|
|
|
|
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9789
|
|
|
|
|
|
|
auto& element = sequences[i].elements[j]; |
9790
|
|
|
|
|
|
|
elementary_feature_value value; |
9791
|
|
|
|
|
|
|
|
9792
|
1345
|
|
|
|
|
|
switch (element.type) { |
9793
|
|
|
|
|
|
|
case PER_FORM: |
9794
|
475
|
0
|
|
|
|
|
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9795
|
|
|
|
|
|
|
break; |
9796
|
|
|
|
|
|
|
case PER_TAG: |
9797
|
844
|
0
|
|
|
|
|
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9798
|
|
|
|
|
|
|
break; |
9799
|
|
|
|
|
|
|
case DYNAMIC: |
9800
|
|
|
|
|
|
|
default: |
9801
|
26
|
|
|
|
|
|
value = dynamic.values[element.elementary_index]; |
9802
|
|
|
|
|
|
|
} |
9803
|
|
|
|
|
|
|
|
9804
|
1345
|
0
|
|
|
|
|
if (value == elementary_feature_unknown) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9805
|
174
|
|
|
|
|
|
key = c.key.data(); |
9806
|
174
|
|
|
|
|
|
break; |
9807
|
|
|
|
|
|
|
} |
9808
|
1171
|
|
|
|
|
|
vli::encode(value, key); |
9809
|
|
|
|
|
|
|
} |
9810
|
|
|
|
|
|
|
|
9811
|
653
|
|
|
|
|
|
result -= c.caches[i].score; |
9812
|
653
|
|
|
|
|
|
int key_size = key - c.key.data(); |
9813
|
653
|
0
|
|
|
|
|
if (!key_size) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9814
|
174
|
|
|
|
|
|
c.caches[i].score = 0; |
9815
|
174
|
|
|
|
|
|
c.caches[i].key_size = 0; |
9816
|
834
|
0
|
|
|
|
|
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9817
|
0
|
|
|
|
|
|
c.caches[i].score = scores[i].score(c.key.data(), key_size); |
9818
|
346
|
|
|
|
|
|
c.caches[i].key_size = key_size; |
9819
|
346
|
|
|
|
|
|
small_memcpy(c.caches[i].key.data(), c.key.data(), key_size); |
9820
|
|
|
|
|
|
|
} |
9821
|
653
|
|
|
|
|
|
result += c.caches[i].score; |
9822
|
|
|
|
|
|
|
} |
9823
|
|
|
|
|
|
|
|
9824
|
13
|
|
|
|
|
|
c.score = result; |
9825
|
13
|
|
|
|
|
|
return result; |
9826
|
|
|
|
|
|
|
} |
9827
|
|
|
|
|
|
|
|
9828
|
|
|
|
|
|
|
template |
9829
|
0
|
|
|
|
|
|
void feature_sequences::feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector& keys, cache& c) const { |
9830
|
0
|
|
|
|
|
|
score(form_index, tags_window, tags_unchanged, dynamic, c); |
9831
|
|
|
|
|
|
|
|
9832
|
0
|
|
|
|
|
|
keys.resize(c.caches.size()); |
9833
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < c.caches.size(); i++) |
9834
|
0
|
|
|
|
|
|
keys[i].assign(c.caches[i].key.data(), c.caches[i].key_size); |
9835
|
0
|
|
|
|
|
|
} |
9836
|
|
|
|
|
|
|
|
9837
|
|
|
|
|
|
|
} // namespace morphodita |
9838
|
|
|
|
|
|
|
|
9839
|
|
|
|
|
|
|
///////// |
9840
|
|
|
|
|
|
|
// File: morphodita/tagger/viterbi.h |
9841
|
|
|
|
|
|
|
///////// |
9842
|
|
|
|
|
|
|
|
9843
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9844
|
|
|
|
|
|
|
// |
9845
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9846
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9847
|
|
|
|
|
|
|
// |
9848
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9849
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9850
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9851
|
|
|
|
|
|
|
|
9852
|
|
|
|
|
|
|
namespace morphodita { |
9853
|
|
|
|
|
|
|
|
9854
|
|
|
|
|
|
|
// Declarations |
9855
|
|
|
|
|
|
|
template |
9856
|
|
|
|
|
|
|
class viterbi { |
9857
|
|
|
|
|
|
|
public: |
9858
|
|
|
|
|
|
|
viterbi(const FeatureSequences& features, int decoding_order, int window_size) |
9859
|
1
|
|
|
|
|
|
: features(features), decoding_order(decoding_order), window_size(window_size) {} |
9860
|
|
|
|
|
|
|
|
9861
|
|
|
|
|
|
|
struct cache; |
9862
|
|
|
|
|
|
|
void tag(const vector& forms, const vector>& analyses, cache& c, vector& tags) const; |
9863
|
|
|
|
|
|
|
|
9864
|
|
|
|
|
|
|
private: |
9865
|
|
|
|
|
|
|
struct node; |
9866
|
|
|
|
|
|
|
|
9867
|
|
|
|
|
|
|
const FeatureSequences& features; |
9868
|
|
|
|
|
|
|
int decoding_order, window_size; |
9869
|
|
|
|
|
|
|
}; |
9870
|
|
|
|
|
|
|
|
9871
|
|
|
|
|
|
|
// Definitions |
9872
|
|
|
|
|
|
|
template |
9873
|
2
|
|
|
|
|
|
struct viterbi::cache { |
9874
|
|
|
|
|
|
|
vector nodes; |
9875
|
|
|
|
|
|
|
typename FeatureSequences::cache features_cache; |
9876
|
|
|
|
|
|
|
|
9877
|
1
|
0
|
|
|
|
|
cache(const viterbi& self) : features_cache(self.features) {} |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9878
|
|
|
|
|
|
|
}; |
9879
|
|
|
|
|
|
|
|
9880
|
|
|
|
|
|
|
template |
9881
|
|
|
|
|
|
|
struct viterbi::node { |
9882
|
|
|
|
|
|
|
int tag; |
9883
|
|
|
|
|
|
|
int prev; |
9884
|
|
|
|
|
|
|
feature_sequences_score score; |
9885
|
|
|
|
|
|
|
typename FeatureSequences::dynamic_features dynamic; |
9886
|
|
|
|
|
|
|
}; |
9887
|
|
|
|
|
|
|
|
9888
|
|
|
|
|
|
|
template |
9889
|
1
|
|
|
|
|
|
void viterbi::tag(const vector& forms, const vector>& analyses, cache& c, vector& tags) const { |
9890
|
2
|
0
|
|
|
|
|
if (!forms.size()) return; |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9891
|
|
|
|
|
|
|
|
9892
|
|
|
|
|
|
|
// Count number of nodes and allocate |
9893
|
|
|
|
|
|
|
unsigned nodes = 0; |
9894
|
8
|
0
|
|
|
|
|
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9895
|
7
|
0
|
|
|
|
|
if (analyses[i].empty()) return; |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9896
|
7
|
0
|
|
|
|
|
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9897
|
7
|
|
|
|
|
|
nodes += states; |
9898
|
|
|
|
|
|
|
} |
9899
|
1
|
0
|
|
|
|
|
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9900
|
|
|
|
|
|
|
|
9901
|
|
|
|
|
|
|
// Init feature sequences |
9902
|
1
|
|
|
|
|
|
features.initialize_sentence(forms, analyses, c.features_cache); |
9903
|
|
|
|
|
|
|
|
9904
|
|
|
|
|
|
|
int window_stack[16]; vector window_heap; |
9905
|
1
|
0
|
|
|
|
|
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9906
|
|
|
|
|
|
|
typename FeatureSequences::dynamic_features dynamic; |
9907
|
|
|
|
|
|
|
feature_sequences_score score; |
9908
|
|
|
|
|
|
|
|
9909
|
|
|
|
|
|
|
// Compute all nodes score |
9910
|
|
|
|
|
|
|
int nodes_prev = -1, nodes_now = 0; |
9911
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9912
|
|
|
|
|
|
|
int nodes_next = nodes_now; |
9913
|
|
|
|
|
|
|
|
9914
|
28
|
0
|
|
|
|
|
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9915
|
17
|
0
|
|
|
|
|
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9916
|
25
|
0
|
|
|
|
|
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9917
|
|
|
|
|
|
|
// Compute predecessors and number of unchanges |
9918
|
15
|
|
|
|
|
|
int same_tags = window[0] == tag; |
9919
|
15
|
|
|
|
|
|
window[0] = tag; |
9920
|
36
|
0
|
|
|
|
|
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9921
|
21
|
0
|
|
|
|
|
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9922
|
42
|
|
|
|
|
|
window[n] = c.nodes[p].tag; |
9923
|
|
|
|
|
|
|
} |
9924
|
|
|
|
|
|
|
|
9925
|
|
|
|
|
|
|
// Compute dynamic elementary features and score |
9926
|
15
|
0
|
|
|
|
|
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9927
|
15
|
0
|
|
|
|
|
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9928
|
12
|
|
|
|
|
|
(prev >= 0 ? c.nodes[prev].score : 0); |
9929
|
|
|
|
|
|
|
|
9930
|
|
|
|
|
|
|
// Update existing node or create a new one |
9931
|
15
|
0
|
|
|
|
|
if (same_tags >= decoding_order-1) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9932
|
2
|
0
|
|
|
|
|
if (score <= c.nodes[nodes_next-1].score) continue; |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9933
|
|
|
|
|
|
|
nodes_next--; |
9934
|
|
|
|
|
|
|
} |
9935
|
28
|
|
|
|
|
|
c.nodes[nodes_next].tag = tag; |
9936
|
14
|
|
|
|
|
|
c.nodes[nodes_next].prev = prev; |
9937
|
14
|
|
|
|
|
|
c.nodes[nodes_next].score = score; |
9938
|
14
|
|
|
|
|
|
c.nodes[nodes_next++].dynamic = dynamic; |
9939
|
|
|
|
|
|
|
} |
9940
|
|
|
|
|
|
|
|
9941
|
|
|
|
|
|
|
nodes_prev = nodes_now; |
9942
|
|
|
|
|
|
|
nodes_now = nodes_next; |
9943
|
|
|
|
|
|
|
} |
9944
|
|
|
|
|
|
|
|
9945
|
|
|
|
|
|
|
// Choose the best ending node |
9946
|
|
|
|
|
|
|
int best = nodes_prev; |
9947
|
2
|
0
|
|
|
|
|
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9948
|
1
|
0
|
|
|
|
|
if (c.nodes[node].score > c.nodes[best].score) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9949
|
|
|
|
|
|
|
best = node; |
9950
|
|
|
|
|
|
|
|
9951
|
8
|
0
|
|
|
|
|
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
9952
|
21
|
|
|
|
|
|
tags[i] = c.nodes[best].tag; |
9953
|
|
|
|
|
|
|
} |
9954
|
|
|
|
|
|
|
|
9955
|
|
|
|
|
|
|
} // namespace morphodita |
9956
|
|
|
|
|
|
|
|
9957
|
|
|
|
|
|
|
///////// |
9958
|
|
|
|
|
|
|
// File: morphodita/tagger/conllu_elementary_features.h |
9959
|
|
|
|
|
|
|
///////// |
9960
|
|
|
|
|
|
|
|
9961
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
9962
|
|
|
|
|
|
|
// |
9963
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
9964
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
9965
|
|
|
|
|
|
|
// |
9966
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
9967
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
9968
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
9969
|
|
|
|
|
|
|
|
9970
|
|
|
|
|
|
|
namespace morphodita { |
9971
|
|
|
|
|
|
|
|
9972
|
|
|
|
|
|
|
// Declarations |
9973
|
|
|
|
|
|
|
template |
9974
|
1
|
|
|
|
|
|
class conllu_elementary_features : public elementary_features |
9975
|
|
|
|
|
|
|
public: |
9976
|
|
|
|
|
|
|
conllu_elementary_features(); |
9977
|
|
|
|
|
|
|
|
9978
|
|
|
|
|
|
|
enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_FORM, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL }; |
9979
|
|
|
|
|
|
|
enum features_per_tag { TAG, TAG_UPOS, TAG_CASE, TAG_GENDER, TAG_NUMBER, TAG_NEGATIVE, TAG_PERSON, LEMMA, PER_TAG_TOTAL }; |
9980
|
|
|
|
|
|
|
enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_FORM, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_FORM, DYNAMIC_TOTAL }; |
9981
|
|
|
|
|
|
|
enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG_UPOS, MAP_TAG_CASE, MAP_TAG_GENDER, MAP_TAG_NUMBER, MAP_TAG_NEGATIVE, MAP_TAG_PERSON, MAP_LEMMA, MAP_TOTAL } ; |
9982
|
|
|
|
|
|
|
|
9983
|
|
|
|
|
|
|
struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; }; |
9984
|
|
|
|
|
|
|
struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; }; |
9985
|
|
|
|
|
|
|
struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; }; |
9986
|
|
|
|
|
|
|
|
9987
|
|
|
|
|
|
|
static vector descriptions; |
9988
|
|
|
|
|
|
|
|
9989
|
|
|
|
|
|
|
void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const; |
9990
|
|
|
|
|
|
|
inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const; |
9991
|
|
|
|
|
|
|
|
9992
|
|
|
|
|
|
|
using elementary_features |
9993
|
|
|
|
|
|
|
}; |
9994
|
|
|
|
|
|
|
|
9995
|
|
|
|
|
|
|
typedef conllu_elementary_features persistent_conllu_elementary_features; |
9996
|
|
|
|
|
|
|
|
9997
|
|
|
|
|
|
|
// Definitions |
9998
|
|
|
|
|
|
|
template |
9999
|
1
|
|
|
|
|
|
conllu_elementary_features |
10000
|
1
|
0
|
|
|
|
|
maps.resize(MAP_TOTAL); |
|
|
50
|
|
|
|
|
|
10001
|
1
|
|
|
|
|
|
} |
10002
|
|
|
|
|
|
|
|
10003
|
|
|
|
|
|
|
template |
10004
|
70
|
50
|
|
|
|
|
vector conllu_elementary_features |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10005
|
|
|
|
|
|
|
{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM}, |
10006
|
|
|
|
|
|
|
{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG}, |
10007
|
|
|
|
|
|
|
{"FollowingVerbForm", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_FORM, MAP_FORM}, |
10008
|
|
|
|
|
|
|
{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE}, |
10009
|
|
|
|
|
|
|
{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE}, |
10010
|
|
|
|
|
|
|
{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE}, |
10011
|
|
|
|
|
|
|
{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1}, |
10012
|
|
|
|
|
|
|
{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2}, |
10013
|
|
|
|
|
|
|
{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3}, |
10014
|
|
|
|
|
|
|
{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4}, |
10015
|
|
|
|
|
|
|
{"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5}, |
10016
|
|
|
|
|
|
|
{"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6}, |
10017
|
|
|
|
|
|
|
{"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7}, |
10018
|
|
|
|
|
|
|
{"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8}, |
10019
|
|
|
|
|
|
|
{"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9}, |
10020
|
|
|
|
|
|
|
{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1}, |
10021
|
|
|
|
|
|
|
{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2}, |
10022
|
|
|
|
|
|
|
{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3}, |
10023
|
|
|
|
|
|
|
{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4}, |
10024
|
|
|
|
|
|
|
{"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5}, |
10025
|
|
|
|
|
|
|
{"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6}, |
10026
|
|
|
|
|
|
|
{"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7}, |
10027
|
|
|
|
|
|
|
{"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8}, |
10028
|
|
|
|
|
|
|
{"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9}, |
10029
|
|
|
|
|
|
|
|
10030
|
|
|
|
|
|
|
{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG}, |
10031
|
|
|
|
|
|
|
{"TagUPos", PER_TAG, ANY_OFFSET, TAG_UPOS, MAP_TAG_UPOS}, |
10032
|
|
|
|
|
|
|
{"TagCase", PER_TAG, ANY_OFFSET, TAG_CASE, MAP_TAG_CASE}, |
10033
|
|
|
|
|
|
|
{"TagGender", PER_TAG, ANY_OFFSET, TAG_GENDER, MAP_TAG_GENDER}, |
10034
|
|
|
|
|
|
|
{"TagNumber", PER_TAG, ANY_OFFSET, TAG_NUMBER, MAP_TAG_NUMBER}, |
10035
|
|
|
|
|
|
|
{"TagNegative", PER_TAG, ANY_OFFSET, TAG_NEGATIVE, MAP_TAG_NEGATIVE}, |
10036
|
|
|
|
|
|
|
{"TagPerson", PER_TAG, ANY_OFFSET, TAG_PERSON, MAP_TAG_PERSON}, |
10037
|
|
|
|
|
|
|
{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA}, |
10038
|
|
|
|
|
|
|
|
10039
|
|
|
|
|
|
|
{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG}, |
10040
|
|
|
|
|
|
|
{"PreviousVerbForm", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_FORM, MAP_FORM}, |
10041
|
|
|
|
|
|
|
}; |
10042
|
|
|
|
|
|
|
|
10043
|
|
|
|
|
|
|
template |
10044
|
1
|
|
|
|
|
|
void conllu_elementary_features |
10045
|
|
|
|
|
|
|
using namespace unilib; |
10046
|
|
|
|
|
|
|
|
10047
|
|
|
|
|
|
|
// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly. |
10048
|
|
|
|
|
|
|
elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_form = elementary_feature_empty; |
10049
|
8
|
100
|
|
|
|
|
for (unsigned i = forms.size(); i--;) { |
|
|
0
|
|
|
|
|
|
10050
|
|
|
|
|
|
|
int verb_candidate = -1; |
10051
|
|
|
|
|
|
|
|
10052
|
|
|
|
|
|
|
// Per_tag features and verb_candidate |
10053
|
17
|
100
|
|
|
|
|
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
|
0
|
|
|
|
|
|
10054
|
10
|
|
|
|
|
|
const string& tag = analyses[i][j].tag; |
10055
|
10
|
|
|
|
|
|
const string& lemma = analyses[i][j].lemma; |
10056
|
|
|
|
|
|
|
|
10057
|
|
|
|
|
|
|
// Tag consists of three parts separated by tag[0] character |
10058
|
|
|
|
|
|
|
// - first is TAG_UPOS, |
10059
|
|
|
|
|
|
|
// - second is TAG_LPOS, |
10060
|
|
|
|
|
|
|
// - then there is any number of | separated named fields in format Name=Value |
10061
|
0
|
|
|
|
|
|
per_tag[i][j].values[TAG] = maps[MAP_TAG].value(tag.c_str(), tag.size()); |
10062
|
10
|
|
|
|
|
|
per_tag[i][j].values[TAG_UPOS] = per_tag[i][j].values[TAG_CASE] = per_tag[i][j].values[TAG_GENDER] = elementary_feature_empty; |
10063
|
10
|
|
|
|
|
|
per_tag[i][j].values[TAG_NUMBER] = per_tag[i][j].values[TAG_NEGATIVE] = per_tag[i][j].values[TAG_PERSON] = elementary_feature_empty; |
10064
|
10
|
100
|
|
|
|
|
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10065
|
|
|
|
|
|
|
maps[MAP_LEMMA].value(lemma.c_str(), lemma.size()); |
10066
|
|
|
|
|
|
|
|
10067
|
10
|
|
|
|
|
|
char separator = tag[0]; |
10068
|
10
|
|
|
|
|
|
size_t index = tag.find(separator, 1); |
10069
|
10
|
50
|
|
|
|
|
if (index == string::npos) index = tag.size(); |
|
|
0
|
|
|
|
|
|
10070
|
10
|
50
|
|
|
|
|
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10071
|
|
|
|
|
|
|
|
10072
|
10
|
50
|
|
|
|
|
if (index < tag.size()) index++; |
|
|
0
|
|
|
|
|
|
10073
|
10
|
50
|
|
|
|
|
if (index < tag.size()) index = tag.find(separator, index); |
|
|
0
|
|
|
|
|
|
10074
|
10
|
50
|
|
|
|
|
if (index < tag.size()) index++; |
|
|
0
|
|
|
|
|
|
10075
|
50
|
100
|
|
|
|
|
for (size_t length; index < tag.size(); index += length + 1) { |
|
|
0
|
|
|
|
|
|
10076
|
40
|
|
|
|
|
|
length = tag.find('|', index); |
10077
|
40
|
100
|
|
|
|
|
length = (length == string::npos ? tag.size() : length) - index; |
|
|
0
|
|
|
|
|
|
10078
|
|
|
|
|
|
|
|
10079
|
280
|
50
|
|
|
|
|
for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++) |
|
|
0
|
|
|
|
|
|
10080
|
280
|
100
|
|
|
|
|
if (tag[index + equal_sign] == '=') { |
|
|
0
|
|
|
|
|
|
10081
|
|
|
|
|
|
|
int value = -1, map; |
10082
|
40
|
|
|
|
|
|
switch (equal_sign) { |
10083
|
|
|
|
|
|
|
case 4: |
10084
|
6
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE; |
|
|
0
|
|
|
|
|
|
10085
|
|
|
|
|
|
|
break; |
10086
|
|
|
|
|
|
|
case 6: |
10087
|
16
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER; |
|
|
0
|
|
|
|
|
|
10088
|
16
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER; |
|
|
0
|
|
|
|
|
|
10089
|
16
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON; |
|
|
0
|
|
|
|
|
|
10090
|
|
|
|
|
|
|
break; |
10091
|
|
|
|
|
|
|
case 8: |
10092
|
10
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE; |
|
|
0
|
|
|
|
|
|
10093
|
|
|
|
|
|
|
break; |
10094
|
|
|
|
|
|
|
} |
10095
|
|
|
|
|
|
|
|
10096
|
40
|
100
|
|
|
|
|
if (value >= 0) |
|
|
0
|
|
|
|
|
|
10097
|
19
|
|
|
|
|
|
per_tag[i][j].values[value] = maps[map].value(tag.c_str() + index + equal_sign + 1, length - equal_sign - 1); |
10098
|
|
|
|
|
|
|
break; |
10099
|
|
|
|
|
|
|
} |
10100
|
|
|
|
|
|
|
} |
10101
|
|
|
|
|
|
|
|
10102
|
10
|
50
|
|
|
|
|
if (tag.size() >= 2 && tag[1] == 'V') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10103
|
|
|
|
|
|
|
int tag_compare; |
10104
|
5
|
100
|
|
|
|
|
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10105
|
|
|
|
|
|
|
} |
10106
|
|
|
|
|
|
|
} |
10107
|
|
|
|
|
|
|
|
10108
|
|
|
|
|
|
|
// Per_form features |
10109
|
0
|
|
|
|
|
|
per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len); |
10110
|
7
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag; |
10111
|
7
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_FORM] = following_verb_form; |
10112
|
|
|
|
|
|
|
|
10113
|
|
|
|
|
|
|
// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}. |
10114
|
7
|
|
|
|
|
|
if (verb_candidate >= 0) { |
10115
|
4
|
|
|
|
|
|
following_verb_tag = per_tag[i][verb_candidate].values[TAG]; |
10116
|
2
|
|
|
|
|
|
following_verb_form = per_form[i].values[FORM]; |
10117
|
|
|
|
|
|
|
} |
10118
|
|
|
|
|
|
|
|
10119
|
|
|
|
|
|
|
// Ortographic per_form features if needed |
10120
|
7
|
100
|
|
|
|
|
if (analyses[i].size() == 1) { |
|
|
0
|
|
|
|
|
|
10121
|
5
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown; |
10122
|
5
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown; |
10123
|
5
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown; |
10124
|
5
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown; |
10125
|
5
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown; |
10126
|
5
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown; |
10127
|
5
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown; |
10128
|
2
|
50
|
|
|
|
|
} else if (forms[i].len <= 0) { |
|
|
0
|
|
|
|
|
|
10129
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1; |
10130
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty; |
10131
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty; |
10132
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty; |
10133
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty; |
10134
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty; |
10135
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty; |
10136
|
|
|
|
|
|
|
} else { |
10137
|
2
|
|
|
|
|
|
string_piece form = forms[i]; |
10138
|
2
|
|
|
|
|
|
const char* form_start = form.str; |
10139
|
|
|
|
|
|
|
|
10140
|
|
|
|
|
|
|
bool num = false, cap = false, dash = false; |
10141
|
18
|
|
|
|
|
|
size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters |
10142
|
|
|
|
|
|
|
int index = 0; |
10143
|
18
|
100
|
|
|
|
|
while (form.len) { |
|
|
0
|
|
|
|
|
|
10144
|
16
|
|
|
|
|
|
indices[(index++) % 18] = form.str - form_start; |
10145
|
|
|
|
|
|
|
|
10146
|
16
|
|
|
|
|
|
unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len)); |
10147
|
16
|
50
|
|
|
|
|
num = num || cat & unicode::N; |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10148
|
16
|
100
|
|
|
|
|
cap = cap || cat & unicode::Lut; |
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10149
|
16
|
50
|
|
|
|
|
dash = dash || cat & unicode::Pd; |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10150
|
|
|
|
|
|
|
|
10151
|
16
|
50
|
|
|
|
|
if (index == 10 || (!form.len && index < 10)) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10152
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]); |
10153
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]); |
10154
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]); |
10155
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]); |
10156
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]); |
10157
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]); |
10158
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]); |
10159
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]); |
10160
|
2
|
|
|
|
|
|
per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]); |
10161
|
|
|
|
|
|
|
} |
10162
|
|
|
|
|
|
|
} |
10163
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]); |
10164
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]); |
10165
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]); |
10166
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]); |
10167
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]); |
10168
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]); |
10169
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]); |
10170
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]); |
10171
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]); |
10172
|
2
|
|
|
|
|
|
per_form[i].values[NUM] = elementary_feature_empty + 1 + num; |
10173
|
2
|
|
|
|
|
|
per_form[i].values[CAP] = elementary_feature_empty + 1 + cap; |
10174
|
2
|
|
|
|
|
|
per_form[i].values[DASH] = elementary_feature_empty + 1 + dash; |
10175
|
|
|
|
|
|
|
} |
10176
|
|
|
|
|
|
|
} |
10177
|
1
|
|
|
|
|
|
} |
10178
|
|
|
|
|
|
|
|
10179
|
|
|
|
|
|
|
template |
10180
|
|
|
|
|
|
|
void conllu_elementary_features |
10181
|
15
|
100
|
|
|
|
|
if (prev_dynamic) { |
|
|
0
|
|
|
|
|
|
10182
|
12
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG]; |
10183
|
12
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_FORM] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_FORM]; |
10184
|
|
|
|
|
|
|
} else { |
10185
|
3
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty; |
10186
|
3
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_FORM] = elementary_feature_empty; |
10187
|
|
|
|
|
|
|
} |
10188
|
|
|
|
|
|
|
|
10189
|
15
|
50
|
|
|
|
|
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10190
|
4
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG]; |
10191
|
4
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = per_form.values[FORM]; |
10192
|
|
|
|
|
|
|
} else { |
10193
|
11
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG]; |
10194
|
11
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = dynamic.values[PREVIOUS_VERB_FORM]; |
10195
|
|
|
|
|
|
|
} |
10196
|
|
|
|
|
|
|
} |
10197
|
|
|
|
|
|
|
|
10198
|
|
|
|
|
|
|
} // namespace morphodita |
10199
|
|
|
|
|
|
|
|
10200
|
|
|
|
|
|
|
///////// |
10201
|
|
|
|
|
|
|
// File: morphodita/tagger/czech_elementary_features.h |
10202
|
|
|
|
|
|
|
///////// |
10203
|
|
|
|
|
|
|
|
10204
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10205
|
|
|
|
|
|
|
// |
10206
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10207
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10208
|
|
|
|
|
|
|
// |
10209
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10210
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10211
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10212
|
|
|
|
|
|
|
|
10213
|
|
|
|
|
|
|
namespace morphodita { |
10214
|
|
|
|
|
|
|
|
10215
|
|
|
|
|
|
|
// Declarations |
10216
|
|
|
|
|
|
|
template |
10217
|
0
|
|
|
|
|
|
class czech_elementary_features : public elementary_features |
10218
|
|
|
|
|
|
|
public: |
10219
|
|
|
|
|
|
|
czech_elementary_features(); |
10220
|
|
|
|
|
|
|
|
10221
|
|
|
|
|
|
|
enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, PER_FORM_TOTAL }; |
10222
|
|
|
|
|
|
|
enum features_per_tag { TAG, TAG3, TAG5, TAG25, LEMMA, PER_TAG_TOTAL }; |
10223
|
|
|
|
|
|
|
enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL }; |
10224
|
|
|
|
|
|
|
enum features_map { MAP_NONE = -1, MAP_FORM, MAP_LEMMA, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_TAG, MAP_TAG3, MAP_TAG5, MAP_TAG25, MAP_TOTAL } ; |
10225
|
|
|
|
|
|
|
|
10226
|
|
|
|
|
|
|
struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; }; |
10227
|
|
|
|
|
|
|
struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; }; |
10228
|
|
|
|
|
|
|
struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; }; |
10229
|
|
|
|
|
|
|
|
10230
|
|
|
|
|
|
|
static vector descriptions; |
10231
|
|
|
|
|
|
|
|
10232
|
|
|
|
|
|
|
void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const; |
10233
|
|
|
|
|
|
|
inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const; |
10234
|
|
|
|
|
|
|
|
10235
|
|
|
|
|
|
|
using elementary_features |
10236
|
|
|
|
|
|
|
}; |
10237
|
|
|
|
|
|
|
|
10238
|
|
|
|
|
|
|
typedef czech_elementary_features persistent_czech_elementary_features; |
10239
|
|
|
|
|
|
|
|
10240
|
|
|
|
|
|
|
// Definitions |
10241
|
|
|
|
|
|
|
template |
10242
|
0
|
|
|
|
|
|
czech_elementary_features |
10243
|
0
|
0
|
|
|
|
|
maps.resize(MAP_TOTAL); |
10244
|
0
|
|
|
|
|
|
} |
10245
|
|
|
|
|
|
|
|
10246
|
|
|
|
|
|
|
template |
10247
|
|
|
|
|
|
|
vector czech_elementary_features |
10248
|
|
|
|
|
|
|
{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM}, |
10249
|
|
|
|
|
|
|
{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG}, |
10250
|
|
|
|
|
|
|
{"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA }, |
10251
|
|
|
|
|
|
|
{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE}, |
10252
|
|
|
|
|
|
|
{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE}, |
10253
|
|
|
|
|
|
|
{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE}, |
10254
|
|
|
|
|
|
|
{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1}, |
10255
|
|
|
|
|
|
|
{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2}, |
10256
|
|
|
|
|
|
|
{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3}, |
10257
|
|
|
|
|
|
|
{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4}, |
10258
|
|
|
|
|
|
|
{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1}, |
10259
|
|
|
|
|
|
|
{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2}, |
10260
|
|
|
|
|
|
|
{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3}, |
10261
|
|
|
|
|
|
|
{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4}, |
10262
|
|
|
|
|
|
|
|
10263
|
|
|
|
|
|
|
{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG}, |
10264
|
|
|
|
|
|
|
{"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3}, |
10265
|
|
|
|
|
|
|
{"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5}, |
10266
|
|
|
|
|
|
|
{"Tag25", PER_TAG, ANY_OFFSET, TAG25, MAP_TAG25}, |
10267
|
|
|
|
|
|
|
{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA}, |
10268
|
|
|
|
|
|
|
|
10269
|
|
|
|
|
|
|
{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG}, |
10270
|
|
|
|
|
|
|
{"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA} |
10271
|
|
|
|
|
|
|
}; |
10272
|
|
|
|
|
|
|
|
10273
|
|
|
|
|
|
|
template |
10274
|
0
|
|
|
|
|
|
void czech_elementary_features |
10275
|
|
|
|
|
|
|
using namespace unilib; |
10276
|
|
|
|
|
|
|
|
10277
|
|
|
|
|
|
|
// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly. |
10278
|
|
|
|
|
|
|
elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty; |
10279
|
0
|
0
|
|
|
|
|
for (unsigned i = forms.size(); i--;) { |
10280
|
|
|
|
|
|
|
int verb_candidate = -1; |
10281
|
|
|
|
|
|
|
|
10282
|
|
|
|
|
|
|
// Per_tag features and verb_candidate |
10283
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < analyses[i].size(); j++) { |
10284
|
|
|
|
|
|
|
char tag25[2]; |
10285
|
0
|
|
|
|
|
|
per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size()); |
10286
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
10287
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
10288
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty; |
10289
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
0
|
|
|
|
|
|
10290
|
|
|
|
|
|
|
maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size()); |
10291
|
|
|
|
|
|
|
|
10292
|
0
|
0
|
|
|
|
|
if (analyses[i][j].tag[0] == 'V') { |
10293
|
|
|
|
|
|
|
int tag_compare; |
10294
|
0
|
0
|
|
|
|
|
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
0
|
|
|
|
|
|
10295
|
|
|
|
|
|
|
} |
10296
|
|
|
|
|
|
|
} |
10297
|
|
|
|
|
|
|
|
10298
|
|
|
|
|
|
|
// Per_form features |
10299
|
0
|
|
|
|
|
|
per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len); |
10300
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag; |
10301
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma; |
10302
|
|
|
|
|
|
|
|
10303
|
|
|
|
|
|
|
// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}. |
10304
|
0
|
0
|
|
|
|
|
if (verb_candidate >= 0) { |
10305
|
0
|
|
|
|
|
|
following_verb_tag = per_tag[i][verb_candidate].values[TAG]; |
10306
|
0
|
|
|
|
|
|
following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA]; |
10307
|
|
|
|
|
|
|
} |
10308
|
|
|
|
|
|
|
|
10309
|
|
|
|
|
|
|
// Ortographic per_form features if needed |
10310
|
0
|
0
|
|
|
|
|
if (analyses[i].size() == 1) { |
10311
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown; |
10312
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_unknown; |
10313
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_unknown; |
10314
|
0
|
0
|
|
|
|
|
} else if (forms[i].len <= 0) { |
10315
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1; |
10316
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_empty; |
10317
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_empty; |
10318
|
|
|
|
|
|
|
} else { |
10319
|
0
|
|
|
|
|
|
string_piece form = forms[i]; |
10320
|
0
|
|
|
|
|
|
const char* form_start = form.str; |
10321
|
|
|
|
|
|
|
|
10322
|
|
|
|
|
|
|
bool num = false, cap = false, dash = false; |
10323
|
0
|
|
|
|
|
|
size_t indices[8] = {0, form.len, form.len, form.len, form.len, 0, 0, 0}; // careful here regarding forms shorter than 4 characters |
10324
|
|
|
|
|
|
|
int index = 0; |
10325
|
0
|
0
|
|
|
|
|
while (form.len) { |
10326
|
0
|
|
|
|
|
|
indices[(index++)&7] = form.str - form_start; |
10327
|
|
|
|
|
|
|
|
10328
|
0
|
|
|
|
|
|
unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len)); |
10329
|
0
|
0
|
|
|
|
|
num = num || cat & unicode::N; |
|
|
0
|
|
|
|
|
|
10330
|
0
|
0
|
|
|
|
|
cap = cap || cat & unicode::Lut; |
|
|
0
|
|
|
|
|
|
10331
|
0
|
0
|
|
|
|
|
dash = dash || cat & unicode::Pd; |
|
|
0
|
|
|
|
|
|
10332
|
|
|
|
|
|
|
|
10333
|
0
|
0
|
|
|
|
|
if (index == 5 || (!form.len && index < 5)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10334
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]); |
10335
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]); |
10336
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]); |
10337
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]); |
10338
|
|
|
|
|
|
|
} |
10339
|
|
|
|
|
|
|
} |
10340
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index-1)&7], form.str - form_start - indices[(index-1)&7]); |
10341
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index-2)&7], form.str - form_start - indices[(index-2)&7]); |
10342
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index-3)&7], form.str - form_start - indices[(index-3)&7]); |
10343
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index-4)&7], form.str - form_start - indices[(index-4)&7]); |
10344
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = elementary_feature_empty + 1 + num; |
10345
|
0
|
|
|
|
|
|
per_form[i].values[CAP] = elementary_feature_empty + 1 + cap; |
10346
|
0
|
|
|
|
|
|
per_form[i].values[DASH] = elementary_feature_empty + 1 + dash; |
10347
|
|
|
|
|
|
|
} |
10348
|
|
|
|
|
|
|
} |
10349
|
0
|
|
|
|
|
|
} |
10350
|
|
|
|
|
|
|
|
10351
|
|
|
|
|
|
|
template |
10352
|
|
|
|
|
|
|
void czech_elementary_features |
10353
|
0
|
0
|
|
|
|
|
if (prev_dynamic) { |
10354
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG]; |
10355
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA]; |
10356
|
|
|
|
|
|
|
} else { |
10357
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty; |
10358
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty; |
10359
|
|
|
|
|
|
|
} |
10360
|
|
|
|
|
|
|
|
10361
|
0
|
0
|
|
|
|
|
if (tag.tag[0] == 'V') { |
10362
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG]; |
10363
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA]; |
10364
|
|
|
|
|
|
|
} else { |
10365
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG]; |
10366
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA]; |
10367
|
|
|
|
|
|
|
} |
10368
|
|
|
|
|
|
|
} |
10369
|
|
|
|
|
|
|
|
10370
|
|
|
|
|
|
|
} // namespace morphodita |
10371
|
|
|
|
|
|
|
|
10372
|
|
|
|
|
|
|
///////// |
10373
|
|
|
|
|
|
|
// File: morphodita/tagger/generic_elementary_features.h |
10374
|
|
|
|
|
|
|
///////// |
10375
|
|
|
|
|
|
|
|
10376
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10377
|
|
|
|
|
|
|
// |
10378
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10379
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10380
|
|
|
|
|
|
|
// |
10381
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10382
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10383
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10384
|
|
|
|
|
|
|
|
10385
|
|
|
|
|
|
|
namespace morphodita { |
10386
|
|
|
|
|
|
|
|
10387
|
|
|
|
|
|
|
// Declarations |
10388
|
|
|
|
|
|
|
template |
10389
|
0
|
|
|
|
|
|
class generic_elementary_features : public elementary_features |
10390
|
|
|
|
|
|
|
public: |
10391
|
|
|
|
|
|
|
generic_elementary_features(); |
10392
|
|
|
|
|
|
|
|
10393
|
|
|
|
|
|
|
enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL }; |
10394
|
|
|
|
|
|
|
enum features_per_tag { TAG, TAG1, TAG2, TAG3, TAG4, TAG5, LEMMA, PER_TAG_TOTAL }; |
10395
|
|
|
|
|
|
|
enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL }; |
10396
|
|
|
|
|
|
|
enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG1, MAP_TAG2, MAP_TAG3, MAP_TAG4, MAP_TAG5, MAP_LEMMA, MAP_TOTAL } ; |
10397
|
|
|
|
|
|
|
|
10398
|
|
|
|
|
|
|
struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; }; |
10399
|
|
|
|
|
|
|
struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; }; |
10400
|
|
|
|
|
|
|
struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; }; |
10401
|
|
|
|
|
|
|
|
10402
|
|
|
|
|
|
|
static vector descriptions; |
10403
|
|
|
|
|
|
|
|
10404
|
|
|
|
|
|
|
void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const; |
10405
|
|
|
|
|
|
|
inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const; |
10406
|
|
|
|
|
|
|
|
10407
|
|
|
|
|
|
|
using elementary_features |
10408
|
|
|
|
|
|
|
}; |
10409
|
|
|
|
|
|
|
|
10410
|
|
|
|
|
|
|
typedef generic_elementary_features persistent_generic_elementary_features; |
10411
|
|
|
|
|
|
|
|
10412
|
|
|
|
|
|
|
// Definitions |
10413
|
|
|
|
|
|
|
template |
10414
|
0
|
|
|
|
|
|
generic_elementary_features |
10415
|
0
|
0
|
|
|
|
|
maps.resize(MAP_TOTAL); |
10416
|
0
|
|
|
|
|
|
} |
10417
|
|
|
|
|
|
|
|
10418
|
|
|
|
|
|
|
template |
10419
|
|
|
|
|
|
|
vector generic_elementary_features |
10420
|
|
|
|
|
|
|
{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM}, |
10421
|
|
|
|
|
|
|
{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG}, |
10422
|
|
|
|
|
|
|
{"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA }, |
10423
|
|
|
|
|
|
|
{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE}, |
10424
|
|
|
|
|
|
|
{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE}, |
10425
|
|
|
|
|
|
|
{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE}, |
10426
|
|
|
|
|
|
|
{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1}, |
10427
|
|
|
|
|
|
|
{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2}, |
10428
|
|
|
|
|
|
|
{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3}, |
10429
|
|
|
|
|
|
|
{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4}, |
10430
|
|
|
|
|
|
|
{"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5}, |
10431
|
|
|
|
|
|
|
{"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6}, |
10432
|
|
|
|
|
|
|
{"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7}, |
10433
|
|
|
|
|
|
|
{"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8}, |
10434
|
|
|
|
|
|
|
{"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9}, |
10435
|
|
|
|
|
|
|
{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1}, |
10436
|
|
|
|
|
|
|
{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2}, |
10437
|
|
|
|
|
|
|
{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3}, |
10438
|
|
|
|
|
|
|
{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4}, |
10439
|
|
|
|
|
|
|
{"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5}, |
10440
|
|
|
|
|
|
|
{"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6}, |
10441
|
|
|
|
|
|
|
{"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7}, |
10442
|
|
|
|
|
|
|
{"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8}, |
10443
|
|
|
|
|
|
|
{"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9}, |
10444
|
|
|
|
|
|
|
|
10445
|
|
|
|
|
|
|
{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG}, |
10446
|
|
|
|
|
|
|
{"Tag1", PER_TAG, ANY_OFFSET, TAG1, MAP_TAG1}, |
10447
|
|
|
|
|
|
|
{"Tag2", PER_TAG, ANY_OFFSET, TAG2, MAP_TAG2}, |
10448
|
|
|
|
|
|
|
{"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3}, |
10449
|
|
|
|
|
|
|
{"Tag4", PER_TAG, ANY_OFFSET, TAG4, MAP_TAG4}, |
10450
|
|
|
|
|
|
|
{"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5}, |
10451
|
|
|
|
|
|
|
{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA}, |
10452
|
|
|
|
|
|
|
|
10453
|
|
|
|
|
|
|
{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG}, |
10454
|
|
|
|
|
|
|
{"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA} |
10455
|
|
|
|
|
|
|
}; |
10456
|
|
|
|
|
|
|
|
10457
|
|
|
|
|
|
|
template |
10458
|
0
|
|
|
|
|
|
void generic_elementary_features |
10459
|
|
|
|
|
|
|
using namespace unilib; |
10460
|
|
|
|
|
|
|
|
10461
|
|
|
|
|
|
|
// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly. |
10462
|
|
|
|
|
|
|
elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty; |
10463
|
0
|
0
|
|
|
|
|
for (unsigned i = forms.size(); i--;) { |
10464
|
|
|
|
|
|
|
int verb_candidate = -1; |
10465
|
|
|
|
|
|
|
|
10466
|
|
|
|
|
|
|
// Per_tag features and verb_candidate |
10467
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < analyses[i].size(); j++) { |
10468
|
0
|
|
|
|
|
|
per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size()); |
10469
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty; |
10470
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty; |
10471
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
10472
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty; |
10473
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
10474
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
0
|
|
|
|
|
|
10475
|
|
|
|
|
|
|
maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size()); |
10476
|
|
|
|
|
|
|
|
10477
|
0
|
0
|
|
|
|
|
if (analyses[i][j].tag[0] == 'V') { |
10478
|
|
|
|
|
|
|
int tag_compare; |
10479
|
0
|
0
|
|
|
|
|
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
0
|
|
|
|
|
|
10480
|
|
|
|
|
|
|
} |
10481
|
|
|
|
|
|
|
} |
10482
|
|
|
|
|
|
|
|
10483
|
|
|
|
|
|
|
// Per_form features |
10484
|
0
|
|
|
|
|
|
per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len); |
10485
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag; |
10486
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma; |
10487
|
|
|
|
|
|
|
|
10488
|
|
|
|
|
|
|
// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}. |
10489
|
0
|
0
|
|
|
|
|
if (verb_candidate >= 0) { |
10490
|
0
|
|
|
|
|
|
following_verb_tag = per_tag[i][verb_candidate].values[TAG]; |
10491
|
0
|
|
|
|
|
|
following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA]; |
10492
|
|
|
|
|
|
|
} |
10493
|
|
|
|
|
|
|
|
10494
|
|
|
|
|
|
|
// Ortographic per_form features if needed |
10495
|
0
|
0
|
|
|
|
|
if (analyses[i].size() == 1) { |
10496
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown; |
10497
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown; |
10498
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown; |
10499
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown; |
10500
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown; |
10501
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown; |
10502
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown; |
10503
|
0
|
0
|
|
|
|
|
} else if (forms[i].len <= 0) { |
10504
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1; |
10505
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty; |
10506
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty; |
10507
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty; |
10508
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty; |
10509
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty; |
10510
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty; |
10511
|
|
|
|
|
|
|
} else { |
10512
|
0
|
|
|
|
|
|
string_piece form = forms[i]; |
10513
|
0
|
|
|
|
|
|
const char* form_start = form.str; |
10514
|
|
|
|
|
|
|
|
10515
|
|
|
|
|
|
|
bool num = false, cap = false, dash = false; |
10516
|
0
|
|
|
|
|
|
size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters |
10517
|
|
|
|
|
|
|
int index = 0; |
10518
|
0
|
0
|
|
|
|
|
while (form.len) { |
10519
|
0
|
|
|
|
|
|
indices[(index++) % 18] = form.str - form_start; |
10520
|
|
|
|
|
|
|
|
10521
|
0
|
|
|
|
|
|
unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len)); |
10522
|
0
|
0
|
|
|
|
|
num = num || cat & unicode::N; |
|
|
0
|
|
|
|
|
|
10523
|
0
|
0
|
|
|
|
|
cap = cap || cat & unicode::Lut; |
|
|
0
|
|
|
|
|
|
10524
|
0
|
0
|
|
|
|
|
dash = dash || cat & unicode::Pd; |
|
|
0
|
|
|
|
|
|
10525
|
|
|
|
|
|
|
|
10526
|
0
|
0
|
|
|
|
|
if (index == 10 || (!form.len && index < 10)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10527
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]); |
10528
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]); |
10529
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]); |
10530
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]); |
10531
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]); |
10532
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]); |
10533
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]); |
10534
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]); |
10535
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]); |
10536
|
|
|
|
|
|
|
} |
10537
|
|
|
|
|
|
|
} |
10538
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]); |
10539
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]); |
10540
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]); |
10541
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]); |
10542
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]); |
10543
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]); |
10544
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]); |
10545
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]); |
10546
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]); |
10547
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = elementary_feature_empty + 1 + num; |
10548
|
0
|
|
|
|
|
|
per_form[i].values[CAP] = elementary_feature_empty + 1 + cap; |
10549
|
0
|
|
|
|
|
|
per_form[i].values[DASH] = elementary_feature_empty + 1 + dash; |
10550
|
|
|
|
|
|
|
} |
10551
|
|
|
|
|
|
|
} |
10552
|
0
|
|
|
|
|
|
} |
10553
|
|
|
|
|
|
|
|
10554
|
|
|
|
|
|
|
template |
10555
|
|
|
|
|
|
|
void generic_elementary_features |
10556
|
0
|
0
|
|
|
|
|
if (prev_dynamic) { |
10557
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG]; |
10558
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA]; |
10559
|
|
|
|
|
|
|
} else { |
10560
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty; |
10561
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty; |
10562
|
|
|
|
|
|
|
} |
10563
|
|
|
|
|
|
|
|
10564
|
0
|
0
|
|
|
|
|
if (tag.tag[0] == 'V') { |
10565
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG]; |
10566
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA]; |
10567
|
|
|
|
|
|
|
} else { |
10568
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG]; |
10569
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA]; |
10570
|
|
|
|
|
|
|
} |
10571
|
|
|
|
|
|
|
} |
10572
|
|
|
|
|
|
|
|
10573
|
|
|
|
|
|
|
} // namespace morphodita |
10574
|
|
|
|
|
|
|
|
10575
|
|
|
|
|
|
|
///////// |
10576
|
|
|
|
|
|
|
// File: morphodita/tagger/perceptron_tagger.h |
10577
|
|
|
|
|
|
|
///////// |
10578
|
|
|
|
|
|
|
|
10579
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10580
|
|
|
|
|
|
|
// |
10581
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10582
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10583
|
|
|
|
|
|
|
// |
10584
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10585
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10586
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10587
|
|
|
|
|
|
|
|
10588
|
|
|
|
|
|
|
namespace morphodita { |
10589
|
|
|
|
|
|
|
|
10590
|
|
|
|
|
|
|
// Declarations |
10591
|
|
|
|
|
|
|
template |
10592
|
4
|
|
|
|
|
|
class perceptron_tagger : public tagger { |
10593
|
|
|
|
|
|
|
public: |
10594
|
|
|
|
|
|
|
perceptron_tagger(int decoding_order, int window_size); |
10595
|
|
|
|
|
|
|
|
10596
|
|
|
|
|
|
|
bool load(istream& is); |
10597
|
|
|
|
|
|
|
virtual const morpho* get_morpho() const override; |
10598
|
|
|
|
|
|
|
virtual void tag(const vector& forms, vector& tags, morpho::guesser_mode guesser = morpho::guesser_mode(-1)) const override; |
10599
|
|
|
|
|
|
|
virtual void tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const override; |
10600
|
|
|
|
|
|
|
|
10601
|
|
|
|
|
|
|
private: |
10602
|
|
|
|
|
|
|
int decoding_order, window_size; |
10603
|
|
|
|
|
|
|
|
10604
|
|
|
|
|
|
|
unique_ptr dict; |
10605
|
|
|
|
|
|
|
bool use_guesser; |
10606
|
|
|
|
|
|
|
FeatureSequences features; |
10607
|
|
|
|
|
|
|
typedef viterbi viterbi_decoder; |
10608
|
|
|
|
|
|
|
viterbi_decoder decoder; |
10609
|
3
|
|
|
|
|
|
struct cache { |
10610
|
|
|
|
|
|
|
vector forms; |
10611
|
|
|
|
|
|
|
vector> analyses; |
10612
|
|
|
|
|
|
|
vector tags; |
10613
|
|
|
|
|
|
|
typename viterbi_decoder::cache decoder_cache; |
10614
|
|
|
|
|
|
|
|
10615
|
1
|
0
|
|
|
|
|
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
10616
|
|
|
|
|
|
|
}; |
10617
|
|
|
|
|
|
|
|
10618
|
|
|
|
|
|
|
mutable threadsafe_stack caches; |
10619
|
|
|
|
|
|
|
}; |
10620
|
|
|
|
|
|
|
|
10621
|
|
|
|
|
|
|
// Definitions |
10622
|
|
|
|
|
|
|
|
10623
|
|
|
|
|
|
|
template |
10624
|
1
|
|
|
|
|
|
perceptron_tagger::perceptron_tagger(int decoding_order, int window_size) |
10625
|
1
|
|
|
|
|
|
: decoding_order(decoding_order), window_size(window_size), decoder(features, decoding_order, window_size) {} |
10626
|
|
|
|
|
|
|
|
10627
|
|
|
|
|
|
|
template |
10628
|
1
|
|
|
|
|
|
bool perceptron_tagger::load(istream& is) { |
10629
|
2
|
50
|
|
|
|
|
if (dict.reset(morpho::load(is)), !dict) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10630
|
1
|
|
|
|
|
|
use_guesser = is.get(); |
10631
|
1
|
50
|
|
|
|
|
if (!features.load(is)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10632
|
1
|
|
|
|
|
|
return true; |
10633
|
|
|
|
|
|
|
} |
10634
|
|
|
|
|
|
|
|
10635
|
|
|
|
|
|
|
template |
10636
|
1
|
|
|
|
|
|
const morpho* perceptron_tagger::get_morpho() const { |
10637
|
1
|
|
|
|
|
|
return dict.get(); |
10638
|
|
|
|
|
|
|
} |
10639
|
|
|
|
|
|
|
|
10640
|
|
|
|
|
|
|
template |
10641
|
1
|
|
|
|
|
|
void perceptron_tagger::tag(const vector& forms, vector& tags, morpho::guesser_mode guesser) const { |
10642
|
|
|
|
|
|
|
tags.clear(); |
10643
|
1
|
0
|
|
|
|
|
if (!dict) return; |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
10644
|
|
|
|
|
|
|
|
10645
|
1
|
|
|
|
|
|
cache* c = caches.pop(); |
10646
|
1
|
0
|
|
|
|
|
if (!c) c = new cache(*this); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
10647
|
|
|
|
|
|
|
|
10648
|
1
|
|
|
|
|
|
c->forms.resize(forms.size()); |
10649
|
1
|
0
|
|
|
|
|
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
10650
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
10651
|
7
|
|
|
|
|
|
c->forms[i] = forms[i]; |
10652
|
7
|
|
|
|
|
|
c->forms[i].len = dict->raw_form_len(forms[i]); |
10653
|
7
|
0
|
|
|
|
|
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
10654
|
|
|
|
|
|
|
} |
10655
|
|
|
|
|
|
|
|
10656
|
1
|
0
|
|
|
|
|
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
10657
|
1
|
|
|
|
|
|
decoder.tag(c->forms, c->analyses, c->decoder_cache, c->tags); |
10658
|
|
|
|
|
|
|
|
10659
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
10660
|
7
|
|
|
|
|
|
tags.emplace_back(c->analyses[i][c->tags[i]]); |
10661
|
|
|
|
|
|
|
|
10662
|
1
|
|
|
|
|
|
caches.push(c); |
10663
|
|
|
|
|
|
|
} |
10664
|
|
|
|
|
|
|
|
10665
|
|
|
|
|
|
|
template |
10666
|
0
|
|
|
|
|
|
void perceptron_tagger::tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const { |
10667
|
|
|
|
|
|
|
tags.clear(); |
10668
|
|
|
|
|
|
|
|
10669
|
0
|
|
|
|
|
|
cache* c = caches.pop(); |
10670
|
0
|
0
|
|
|
|
|
if (!c) c = new cache(*this); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10671
|
|
|
|
|
|
|
|
10672
|
0
|
|
|
|
|
|
tags.resize(forms.size()); |
10673
|
0
|
|
|
|
|
|
decoder.tag(forms, analyses, c->decoder_cache, tags); |
10674
|
|
|
|
|
|
|
|
10675
|
0
|
|
|
|
|
|
caches.push(c); |
10676
|
0
|
|
|
|
|
|
} |
10677
|
|
|
|
|
|
|
|
10678
|
|
|
|
|
|
|
} // namespace morphodita |
10679
|
|
|
|
|
|
|
|
10680
|
|
|
|
|
|
|
///////// |
10681
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger.cpp |
10682
|
|
|
|
|
|
|
///////// |
10683
|
|
|
|
|
|
|
|
10684
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10685
|
|
|
|
|
|
|
// |
10686
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10687
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10688
|
|
|
|
|
|
|
// |
10689
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10690
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10691
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10692
|
|
|
|
|
|
|
|
10693
|
|
|
|
|
|
|
namespace morphodita { |
10694
|
|
|
|
|
|
|
|
10695
|
1
|
|
|
|
|
|
tagger* tagger::load(istream& is) { |
10696
|
1
|
50
|
|
|
|
|
tagger_id id = tagger_id(is.get()); |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10697
|
|
|
|
|
|
|
switch (id) { |
10698
|
|
|
|
|
|
|
case tagger_ids::CZECH2: |
10699
|
|
|
|
|
|
|
case tagger_ids::CZECH2_3: |
10700
|
|
|
|
|
|
|
case tagger_ids::CZECH3: |
10701
|
|
|
|
|
|
|
{ |
10702
|
0
|
0
|
|
|
|
|
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
10703
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
0
|
|
|
|
|
|
10704
|
|
|
|
|
|
|
break; |
10705
|
|
|
|
|
|
|
} |
10706
|
|
|
|
|
|
|
case tagger_ids::GENERIC2: |
10707
|
|
|
|
|
|
|
case tagger_ids::GENERIC2_3: |
10708
|
|
|
|
|
|
|
case tagger_ids::GENERIC3: |
10709
|
|
|
|
|
|
|
case tagger_ids::GENERIC4: |
10710
|
|
|
|
|
|
|
{ |
10711
|
0
|
0
|
|
|
|
|
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
10712
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
0
|
|
|
|
|
|
10713
|
|
|
|
|
|
|
break; |
10714
|
|
|
|
|
|
|
} |
10715
|
|
|
|
|
|
|
case tagger_ids::CONLLU2: |
10716
|
|
|
|
|
|
|
case tagger_ids::CONLLU2_3: |
10717
|
|
|
|
|
|
|
case tagger_ids::CONLLU3: |
10718
|
|
|
|
|
|
|
{ |
10719
|
1
|
50
|
|
|
|
|
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
10720
|
1
|
50
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
50
|
|
|
|
|
|
10721
|
|
|
|
|
|
|
break; |
10722
|
|
|
|
|
|
|
} |
10723
|
|
|
|
|
|
|
} |
10724
|
|
|
|
|
|
|
|
10725
|
|
|
|
|
|
|
return nullptr; |
10726
|
|
|
|
|
|
|
} |
10727
|
|
|
|
|
|
|
|
10728
|
0
|
|
|
|
|
|
tagger* tagger::load(const char* fname) { |
10729
|
0
|
0
|
|
|
|
|
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
10730
|
0
|
0
|
|
|
|
|
if (!f) return nullptr; |
10731
|
|
|
|
|
|
|
|
10732
|
0
|
0
|
|
|
|
|
return load(f); |
10733
|
|
|
|
|
|
|
} |
10734
|
|
|
|
|
|
|
|
10735
|
0
|
|
|
|
|
|
tokenizer* tagger::new_tokenizer() const { |
10736
|
0
|
|
|
|
|
|
auto morpho = get_morpho(); |
10737
|
0
|
0
|
|
|
|
|
return morpho ? morpho->new_tokenizer() : nullptr; |
10738
|
|
|
|
|
|
|
} |
10739
|
|
|
|
|
|
|
|
10740
|
|
|
|
|
|
|
} // namespace morphodita |
10741
|
|
|
|
|
|
|
|
10742
|
|
|
|
|
|
|
///////// |
10743
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/identity_tagset_converter.h |
10744
|
|
|
|
|
|
|
///////// |
10745
|
|
|
|
|
|
|
|
10746
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10747
|
|
|
|
|
|
|
// |
10748
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10749
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10750
|
|
|
|
|
|
|
// |
10751
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10752
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10753
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10754
|
|
|
|
|
|
|
|
10755
|
|
|
|
|
|
|
namespace morphodita { |
10756
|
|
|
|
|
|
|
|
10757
|
0
|
|
|
|
|
|
class identity_tagset_converter : public tagset_converter { |
10758
|
|
|
|
|
|
|
public: |
10759
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
10760
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
10761
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
10762
|
|
|
|
|
|
|
}; |
10763
|
|
|
|
|
|
|
|
10764
|
|
|
|
|
|
|
} // namespace morphodita |
10765
|
|
|
|
|
|
|
|
10766
|
|
|
|
|
|
|
///////// |
10767
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/identity_tagset_converter.cpp |
10768
|
|
|
|
|
|
|
///////// |
10769
|
|
|
|
|
|
|
|
10770
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10771
|
|
|
|
|
|
|
// |
10772
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10773
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10774
|
|
|
|
|
|
|
// |
10775
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10776
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10777
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10778
|
|
|
|
|
|
|
|
10779
|
|
|
|
|
|
|
namespace morphodita { |
10780
|
|
|
|
|
|
|
|
10781
|
0
|
|
|
|
|
|
void identity_tagset_converter::convert(tagged_lemma& /*tagged_lemma*/) const {} |
10782
|
|
|
|
|
|
|
|
10783
|
0
|
|
|
|
|
|
void identity_tagset_converter::convert_analyzed(vector& /*tagged_lemmas*/) const {} |
10784
|
|
|
|
|
|
|
|
10785
|
0
|
|
|
|
|
|
void identity_tagset_converter::convert_generated(vector& /*forms*/) const {} |
10786
|
|
|
|
|
|
|
|
10787
|
|
|
|
|
|
|
} // namespace morphodita |
10788
|
|
|
|
|
|
|
|
10789
|
|
|
|
|
|
|
///////// |
10790
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.h |
10791
|
|
|
|
|
|
|
///////// |
10792
|
|
|
|
|
|
|
|
10793
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10794
|
|
|
|
|
|
|
// |
10795
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10796
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10797
|
|
|
|
|
|
|
// |
10798
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10799
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10800
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10801
|
|
|
|
|
|
|
|
10802
|
|
|
|
|
|
|
namespace morphodita { |
10803
|
|
|
|
|
|
|
|
10804
|
0
|
|
|
|
|
|
class pdt_to_conll2009_tagset_converter : public tagset_converter { |
10805
|
|
|
|
|
|
|
public: |
10806
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
10807
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
10808
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
10809
|
|
|
|
|
|
|
|
10810
|
|
|
|
|
|
|
private: |
10811
|
|
|
|
|
|
|
inline void convert_tag(const string& lemma, string& tag) const; |
10812
|
|
|
|
|
|
|
inline bool convert_lemma(string& lemma) const; |
10813
|
|
|
|
|
|
|
}; |
10814
|
|
|
|
|
|
|
|
10815
|
|
|
|
|
|
|
} // namespace morphodita |
10816
|
|
|
|
|
|
|
|
10817
|
|
|
|
|
|
|
///////// |
10818
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.cpp |
10819
|
|
|
|
|
|
|
///////// |
10820
|
|
|
|
|
|
|
|
10821
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10822
|
|
|
|
|
|
|
// |
10823
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10824
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10825
|
|
|
|
|
|
|
// |
10826
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10827
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10828
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10829
|
|
|
|
|
|
|
|
10830
|
|
|
|
|
|
|
namespace morphodita { |
10831
|
|
|
|
|
|
|
|
10832
|
|
|
|
|
|
|
static const char* names[15] = {"POS", "SubPOS", "Gen", "Num", "Cas", "PGe", "PNu", "Per", "Ten", "Gra", "Neg", "Voi", "", "", "Var"}; |
10833
|
|
|
|
|
|
|
|
10834
|
0
|
|
|
|
|
|
inline void pdt_to_conll2009_tagset_converter::convert_tag(const string& lemma, string& tag) const { |
10835
|
|
|
|
|
|
|
char pdt_tag[16]; |
10836
|
|
|
|
|
|
|
strncpy(pdt_tag, tag.c_str(), 15); |
10837
|
|
|
|
|
|
|
|
10838
|
|
|
|
|
|
|
// Clear the tag |
10839
|
|
|
|
|
|
|
tag.clear(); |
10840
|
|
|
|
|
|
|
|
10841
|
|
|
|
|
|
|
// Fill FEAT of filled tag characters |
10842
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
|
|
0
|
|
|
|
|
|
10843
|
0
|
0
|
|
|
|
|
if (pdt_tag[i] != '-') { |
10844
|
0
|
0
|
|
|
|
|
if (!tag.empty()) tag.push_back('|'); |
10845
|
0
|
|
|
|
|
|
tag.append(names[i]); |
10846
|
0
|
|
|
|
|
|
tag.push_back('='); |
10847
|
0
|
|
|
|
|
|
tag.push_back(pdt_tag[i]); |
10848
|
|
|
|
|
|
|
} |
10849
|
|
|
|
|
|
|
|
10850
|
|
|
|
|
|
|
// Try adding Sem FEAT |
10851
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i + 2 < lemma.size(); i++) |
10852
|
0
|
0
|
|
|
|
|
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10853
|
0
|
0
|
|
|
|
|
if (!tag.empty()) tag.push_back('|'); |
10854
|
0
|
|
|
|
|
|
tag.append("Sem="); |
10855
|
0
|
|
|
|
|
|
tag.push_back(lemma[i + 2]); |
10856
|
|
|
|
|
|
|
break; |
10857
|
|
|
|
|
|
|
} |
10858
|
0
|
|
|
|
|
|
} |
10859
|
|
|
|
|
|
|
|
10860
|
0
|
|
|
|
|
|
inline bool pdt_to_conll2009_tagset_converter::convert_lemma(string& lemma) const { |
10861
|
0
|
|
|
|
|
|
unsigned raw_lemma = czech_lemma_addinfo::raw_lemma_len(lemma); |
10862
|
0
|
0
|
|
|
|
|
return raw_lemma < lemma.size() ? (lemma.resize(raw_lemma), true) : false; |
10863
|
|
|
|
|
|
|
} |
10864
|
|
|
|
|
|
|
|
10865
|
0
|
|
|
|
|
|
void pdt_to_conll2009_tagset_converter::convert(tagged_lemma& tagged_lemma) const { |
10866
|
0
|
|
|
|
|
|
convert_tag(tagged_lemma.lemma, tagged_lemma.tag); |
10867
|
0
|
|
|
|
|
|
convert_lemma(tagged_lemma.lemma); |
10868
|
0
|
|
|
|
|
|
} |
10869
|
|
|
|
|
|
|
|
10870
|
0
|
|
|
|
|
|
void pdt_to_conll2009_tagset_converter::convert_analyzed(vector& tagged_lemmas) const { |
10871
|
|
|
|
|
|
|
bool lemma_changed = false; |
10872
|
|
|
|
|
|
|
|
10873
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma : tagged_lemmas) { |
10874
|
0
|
|
|
|
|
|
convert_tag(tagged_lemma.lemma, tagged_lemma.tag); |
10875
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma.lemma); |
10876
|
|
|
|
|
|
|
} |
10877
|
|
|
|
|
|
|
|
10878
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
10879
|
0
|
0
|
|
|
|
|
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10880
|
|
|
|
|
|
|
|
10881
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(tagged_lemmas); |
10882
|
|
|
|
|
|
|
} |
10883
|
|
|
|
|
|
|
|
10884
|
0
|
|
|
|
|
|
void pdt_to_conll2009_tagset_converter::convert_generated(vector& forms) const { |
10885
|
|
|
|
|
|
|
bool lemma_changed = false; |
10886
|
|
|
|
|
|
|
|
10887
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma_forms : forms) { |
10888
|
0
|
0
|
|
|
|
|
for (auto&& tagged_form : tagged_lemma_forms.forms) |
10889
|
0
|
|
|
|
|
|
convert_tag(tagged_lemma_forms.lemma, tagged_form.tag); |
10890
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma_forms.lemma); |
10891
|
|
|
|
|
|
|
} |
10892
|
|
|
|
|
|
|
|
10893
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
10894
|
0
|
0
|
|
|
|
|
if (!lemma_changed || forms.size() < 2) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10895
|
|
|
|
|
|
|
|
10896
|
0
|
|
|
|
|
|
tagset_converter_unique_generated(forms); |
10897
|
|
|
|
|
|
|
} |
10898
|
|
|
|
|
|
|
|
10899
|
|
|
|
|
|
|
} // namespace morphodita |
10900
|
|
|
|
|
|
|
|
10901
|
|
|
|
|
|
|
///////// |
10902
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.h |
10903
|
|
|
|
|
|
|
///////// |
10904
|
|
|
|
|
|
|
|
10905
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10906
|
|
|
|
|
|
|
// |
10907
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10908
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10909
|
|
|
|
|
|
|
// |
10910
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10911
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10912
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10913
|
|
|
|
|
|
|
|
10914
|
|
|
|
|
|
|
namespace morphodita { |
10915
|
|
|
|
|
|
|
|
10916
|
0
|
|
|
|
|
|
class strip_lemma_comment_tagset_converter : public tagset_converter { |
10917
|
|
|
|
|
|
|
public: |
10918
|
0
|
|
|
|
|
|
strip_lemma_comment_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {} |
10919
|
|
|
|
|
|
|
|
10920
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
10921
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
10922
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
10923
|
|
|
|
|
|
|
|
10924
|
|
|
|
|
|
|
private: |
10925
|
|
|
|
|
|
|
inline bool convert_lemma(string& lemma) const; |
10926
|
|
|
|
|
|
|
const morpho& dictionary; |
10927
|
|
|
|
|
|
|
}; |
10928
|
|
|
|
|
|
|
|
10929
|
|
|
|
|
|
|
} // namespace morphodita |
10930
|
|
|
|
|
|
|
|
10931
|
|
|
|
|
|
|
///////// |
10932
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.cpp |
10933
|
|
|
|
|
|
|
///////// |
10934
|
|
|
|
|
|
|
|
10935
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10936
|
|
|
|
|
|
|
// |
10937
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10938
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10939
|
|
|
|
|
|
|
// |
10940
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10941
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10942
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10943
|
|
|
|
|
|
|
|
10944
|
|
|
|
|
|
|
namespace morphodita { |
10945
|
|
|
|
|
|
|
|
10946
|
0
|
|
|
|
|
|
inline bool strip_lemma_comment_tagset_converter::convert_lemma(string& lemma) const { |
10947
|
0
|
|
|
|
|
|
unsigned lemma_id_len = dictionary.lemma_id_len(lemma); |
10948
|
0
|
0
|
|
|
|
|
return lemma_id_len < lemma.size() ? (lemma.resize(lemma_id_len), true) : false; |
10949
|
|
|
|
|
|
|
} |
10950
|
|
|
|
|
|
|
|
10951
|
0
|
|
|
|
|
|
void strip_lemma_comment_tagset_converter::convert(tagged_lemma& tagged_lemma) const { |
10952
|
0
|
|
|
|
|
|
convert_lemma(tagged_lemma.lemma); |
10953
|
0
|
|
|
|
|
|
} |
10954
|
|
|
|
|
|
|
|
10955
|
0
|
|
|
|
|
|
void strip_lemma_comment_tagset_converter::convert_analyzed(vector& tagged_lemmas) const { |
10956
|
|
|
|
|
|
|
bool lemma_changed = false; |
10957
|
|
|
|
|
|
|
|
10958
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma : tagged_lemmas) |
10959
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma.lemma); |
10960
|
|
|
|
|
|
|
|
10961
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
10962
|
0
|
0
|
|
|
|
|
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10963
|
|
|
|
|
|
|
|
10964
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(tagged_lemmas); |
10965
|
|
|
|
|
|
|
} |
10966
|
|
|
|
|
|
|
|
10967
|
0
|
|
|
|
|
|
void strip_lemma_comment_tagset_converter::convert_generated(vector& forms) const { |
10968
|
|
|
|
|
|
|
bool lemma_changed = false; |
10969
|
|
|
|
|
|
|
|
10970
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma_forms : forms) |
10971
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma_forms.lemma); |
10972
|
|
|
|
|
|
|
|
10973
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
10974
|
0
|
0
|
|
|
|
|
if (!lemma_changed || forms.size() < 2) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
10975
|
|
|
|
|
|
|
|
10976
|
0
|
|
|
|
|
|
tagset_converter_unique_generated(forms); |
10977
|
|
|
|
|
|
|
} |
10978
|
|
|
|
|
|
|
|
10979
|
|
|
|
|
|
|
} // namespace morphodita |
10980
|
|
|
|
|
|
|
|
10981
|
|
|
|
|
|
|
///////// |
10982
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.h |
10983
|
|
|
|
|
|
|
///////// |
10984
|
|
|
|
|
|
|
|
10985
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
10986
|
|
|
|
|
|
|
// |
10987
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
10988
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
10989
|
|
|
|
|
|
|
// |
10990
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
10991
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
10992
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
10993
|
|
|
|
|
|
|
|
10994
|
|
|
|
|
|
|
namespace morphodita { |
10995
|
|
|
|
|
|
|
|
10996
|
0
|
|
|
|
|
|
class strip_lemma_id_tagset_converter : public tagset_converter { |
10997
|
|
|
|
|
|
|
public: |
10998
|
0
|
|
|
|
|
|
strip_lemma_id_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {} |
10999
|
|
|
|
|
|
|
|
11000
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
11001
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
11002
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
11003
|
|
|
|
|
|
|
|
11004
|
|
|
|
|
|
|
private: |
11005
|
|
|
|
|
|
|
inline bool convert_lemma(string& lemma) const; |
11006
|
|
|
|
|
|
|
const morpho& dictionary; |
11007
|
|
|
|
|
|
|
}; |
11008
|
|
|
|
|
|
|
|
11009
|
|
|
|
|
|
|
} // namespace morphodita |
11010
|
|
|
|
|
|
|
|
11011
|
|
|
|
|
|
|
///////// |
11012
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.cpp |
11013
|
|
|
|
|
|
|
///////// |
11014
|
|
|
|
|
|
|
|
11015
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
11016
|
|
|
|
|
|
|
// |
11017
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
11018
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11019
|
|
|
|
|
|
|
// |
11020
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11021
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11022
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11023
|
|
|
|
|
|
|
|
11024
|
|
|
|
|
|
|
namespace morphodita { |
11025
|
|
|
|
|
|
|
|
11026
|
0
|
|
|
|
|
|
inline bool strip_lemma_id_tagset_converter::convert_lemma(string& lemma) const { |
11027
|
0
|
|
|
|
|
|
unsigned raw_lemma_len = dictionary.raw_lemma_len(lemma); |
11028
|
0
|
0
|
|
|
|
|
return raw_lemma_len < lemma.size() ? (lemma.resize(raw_lemma_len), true) : false; |
11029
|
|
|
|
|
|
|
} |
11030
|
|
|
|
|
|
|
|
11031
|
0
|
|
|
|
|
|
void strip_lemma_id_tagset_converter::convert(tagged_lemma& tagged_lemma) const { |
11032
|
0
|
|
|
|
|
|
convert_lemma(tagged_lemma.lemma); |
11033
|
0
|
|
|
|
|
|
} |
11034
|
|
|
|
|
|
|
|
11035
|
0
|
|
|
|
|
|
void strip_lemma_id_tagset_converter::convert_analyzed(vector& tagged_lemmas) const { |
11036
|
|
|
|
|
|
|
bool lemma_changed = false; |
11037
|
|
|
|
|
|
|
|
11038
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma : tagged_lemmas) |
11039
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma.lemma); |
11040
|
|
|
|
|
|
|
|
11041
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
11042
|
0
|
0
|
|
|
|
|
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
11043
|
|
|
|
|
|
|
|
11044
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(tagged_lemmas); |
11045
|
|
|
|
|
|
|
} |
11046
|
|
|
|
|
|
|
|
11047
|
0
|
|
|
|
|
|
void strip_lemma_id_tagset_converter::convert_generated(vector& forms) const { |
11048
|
|
|
|
|
|
|
bool lemma_changed = false; |
11049
|
|
|
|
|
|
|
|
11050
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma_forms : forms) |
11051
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma_forms.lemma); |
11052
|
|
|
|
|
|
|
|
11053
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
11054
|
0
|
0
|
|
|
|
|
if (!lemma_changed || forms.size() < 2) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
11055
|
|
|
|
|
|
|
|
11056
|
0
|
|
|
|
|
|
tagset_converter_unique_generated(forms); |
11057
|
|
|
|
|
|
|
} |
11058
|
|
|
|
|
|
|
|
11059
|
|
|
|
|
|
|
} // namespace morphodita |
11060
|
|
|
|
|
|
|
|
11061
|
|
|
|
|
|
|
///////// |
11062
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/tagset_converter.cpp |
11063
|
|
|
|
|
|
|
///////// |
11064
|
|
|
|
|
|
|
|
11065
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
11066
|
|
|
|
|
|
|
// |
11067
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
11068
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11069
|
|
|
|
|
|
|
// |
11070
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11071
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11072
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11073
|
|
|
|
|
|
|
|
11074
|
|
|
|
|
|
|
namespace morphodita { |
11075
|
|
|
|
|
|
|
|
11076
|
0
|
|
|
|
|
|
tagset_converter* tagset_converter::new_identity_converter() { |
11077
|
0
|
|
|
|
|
|
return new identity_tagset_converter(); |
11078
|
|
|
|
|
|
|
} |
11079
|
|
|
|
|
|
|
|
11080
|
0
|
|
|
|
|
|
tagset_converter* tagset_converter::new_pdt_to_conll2009_converter() { |
11081
|
0
|
|
|
|
|
|
return new pdt_to_conll2009_tagset_converter(); |
11082
|
|
|
|
|
|
|
} |
11083
|
|
|
|
|
|
|
|
11084
|
0
|
|
|
|
|
|
tagset_converter* tagset_converter::new_strip_lemma_comment_converter(const morpho& dictionary) { |
11085
|
0
|
|
|
|
|
|
return new strip_lemma_comment_tagset_converter(dictionary); |
11086
|
|
|
|
|
|
|
} |
11087
|
|
|
|
|
|
|
|
11088
|
0
|
|
|
|
|
|
tagset_converter* tagset_converter::new_strip_lemma_id_converter(const morpho& dictionary) { |
11089
|
0
|
|
|
|
|
|
return new strip_lemma_id_tagset_converter(dictionary); |
11090
|
|
|
|
|
|
|
} |
11091
|
|
|
|
|
|
|
|
11092
|
0
|
|
|
|
|
|
tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary) { |
11093
|
0
|
0
|
|
|
|
|
if (name == "pdt_to_conll2009") return tagset_converter::new_pdt_to_conll2009_converter(); |
11094
|
0
|
0
|
|
|
|
|
if (name == "strip_lemma_comment") return tagset_converter::new_strip_lemma_comment_converter(dictionary); |
11095
|
0
|
0
|
|
|
|
|
if (name == "strip_lemma_id") return tagset_converter::new_strip_lemma_id_converter(dictionary); |
11096
|
|
|
|
|
|
|
return nullptr; |
11097
|
|
|
|
|
|
|
} |
11098
|
|
|
|
|
|
|
|
11099
|
0
|
|
|
|
|
|
void tagset_converter_unique_analyzed(vector& tagged_lemmas) { |
11100
|
|
|
|
|
|
|
// Remove possible lemma-tag pair duplicates |
11101
|
|
|
|
|
|
|
struct tagged_lemma_comparator { |
11102
|
0
|
0
|
|
|
|
|
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
|
|
0
|
|
|
|
|
|
11103
|
0
|
0
|
|
|
|
|
inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); } |
11104
|
|
|
|
|
|
|
}; |
11105
|
|
|
|
|
|
|
|
11106
|
|
|
|
|
|
|
sort(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::lt); |
11107
|
0
|
|
|
|
|
|
tagged_lemmas.resize(unique(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::eq) - tagged_lemmas.begin()); |
11108
|
0
|
|
|
|
|
|
} |
11109
|
|
|
|
|
|
|
|
11110
|
0
|
|
|
|
|
|
void tagset_converter_unique_generated(vector& forms) { |
11111
|
|
|
|
|
|
|
// Regroup and if needed remove duplicate form-tag pairs for each lemma |
11112
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) { |
11113
|
|
|
|
|
|
|
bool any_merged = false; |
11114
|
0
|
0
|
|
|
|
|
for (unsigned j = forms.size() - 1; j > i; j--) |
11115
|
0
|
0
|
|
|
|
|
if (forms[j].lemma == forms[i].lemma) { |
11116
|
|
|
|
|
|
|
// Same lemma was found. Merge form-tag pairs |
11117
|
0
|
0
|
|
|
|
|
for (auto&& tagged_form : forms[j].forms) |
11118
|
0
|
|
|
|
|
|
forms[i].forms.emplace_back(move(tagged_form)); |
11119
|
|
|
|
|
|
|
|
11120
|
|
|
|
|
|
|
// Remove lemma j by moving it to end and deleting |
11121
|
0
|
0
|
|
|
|
|
if (j < forms.size() - 1) { |
11122
|
0
|
|
|
|
|
|
forms[j].lemma.swap(forms[forms.size() - 1].lemma); |
11123
|
0
|
|
|
|
|
|
forms[j].forms.swap(forms[forms.size() - 1].forms); |
11124
|
|
|
|
|
|
|
} |
11125
|
|
|
|
|
|
|
forms.pop_back(); |
11126
|
|
|
|
|
|
|
any_merged = true; |
11127
|
|
|
|
|
|
|
} |
11128
|
|
|
|
|
|
|
|
11129
|
0
|
0
|
|
|
|
|
if (any_merged && forms[i].forms.size() > 1) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
11130
|
|
|
|
|
|
|
// Remove duplicate form-tag pairs |
11131
|
|
|
|
|
|
|
struct tagged_form_comparator { |
11132
|
0
|
0
|
|
|
|
|
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
|
|
0
|
|
|
|
|
|
11133
|
0
|
0
|
|
|
|
|
inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 || (tag_compare == 0 && a.form < b.form); } |
11134
|
|
|
|
|
|
|
}; |
11135
|
|
|
|
|
|
|
|
11136
|
|
|
|
|
|
|
sort(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::lt); |
11137
|
0
|
|
|
|
|
|
forms[i].forms.resize(unique(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::eq) - forms[i].forms.begin()); |
11138
|
|
|
|
|
|
|
} |
11139
|
|
|
|
|
|
|
} |
11140
|
0
|
|
|
|
|
|
} |
11141
|
|
|
|
|
|
|
|
11142
|
|
|
|
|
|
|
} // namespace morphodita |
11143
|
|
|
|
|
|
|
|
11144
|
|
|
|
|
|
|
///////// |
11145
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer.cpp |
11146
|
|
|
|
|
|
|
///////// |
11147
|
|
|
|
|
|
|
|
11148
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
11149
|
|
|
|
|
|
|
// |
11150
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
11151
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11152
|
|
|
|
|
|
|
// |
11153
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11154
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11155
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11156
|
|
|
|
|
|
|
|
11157
|
|
|
|
|
|
|
namespace morphodita { |
11158
|
|
|
|
|
|
|
|
11159
|
|
|
|
|
|
|
// Transition tables of the table-driven state machine executed by
// czech_tokenizer::next_sentence. They look like Ragel output (the machine is
// driven through ragel_tokenizer / ragel_char) - treat them as generated data
// and do not edit by hand. Arrays described as "per state" are indexed by the
// current state `cs`.

// Per state: offset (counted in [low, high] key pairs) of the state's
// condition keys inside _czech_tokenizer_cond_keys; also the base index into
// _czech_tokenizer_cond_spaces.
static const char _czech_tokenizer_cond_offsets[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2
};

// Per state: number of condition key pairs to search (0 = the state has no
// conditional input characters).
static const char _czech_tokenizer_cond_lengths[] = {
  0, 0, 0, 0, 0, 0, 0, 2,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};

// Condition keys stored as [low, high] pairs (here 43 = '+' and 45 = '-').
static const short _czech_tokenizer_cond_keys[] = {
  43u, 43u, 45u, 45u, 0
};

// For a matched condition key pair: which condition case (0 or 1) of
// next_sentence is evaluated to widen the input character.
static const char _czech_tokenizer_cond_spaces[] = {
  1, 0, 0
};

// Per state: index of the state's first key in _czech_tokenizer_trans_keys.
static const unsigned char _czech_tokenizer_key_offsets[] = {
  0, 0, 17, 29, 43, 46, 51, 54,
  89, 94, 98, 101, 105, 110, 111, 116,
  117, 122, 136, 143, 148, 151, 163
};

// Transition keys. For each state: first its single keys (count given by
// _czech_tokenizer_single_lengths), then its [low, high] range pairs (count
// given by _czech_tokenizer_range_lengths). Values above 255 (301, 557, 811,
// 1067) are the widened codes produced by the condition cases in
// next_sentence.
static const short _czech_tokenizer_trans_keys[] = {
  13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
  133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
  90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
  135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
  39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
  161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u,
  159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u,
  13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u,
  131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
  557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
  64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
  255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
  32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
  147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
  10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
  32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
  93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
  44u, 46u, 69u, 101u, 159u, 48u, 57u, 69u,
  101u, 159u, 48u, 57u, 159u, 48u, 57u, 129u,
  131u, 135u, 151u, 155u, 157u, 65u, 90u, 97u,
  122u, 142u, 143u, 159u, 48u, 57u, 0
};

// Per state: number of single (exact-match) keys.
static const char _czech_tokenizer_single_lengths[] = {
  0, 13, 10, 12, 1, 3, 1, 21,
  5, 4, 3, 4, 5, 1, 5, 1,
  5, 12, 5, 3, 1, 6, 1
};

// Per state: number of [low, high] range keys.
static const char _czech_tokenizer_range_lengths[] = {
  0, 2, 1, 1, 1, 1, 1, 7,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 1, 1, 3, 1
};

// Per state: base index into _czech_tokenizer_indicies.
static const unsigned char _czech_tokenizer_index_offsets[] = {
  0, 0, 16, 28, 42, 45, 50, 53,
  82, 88, 93, 97, 102, 108, 110, 116,
  118, 124, 138, 145, 150, 153, 163
};

// Maps a matched key slot (or the slot after all keys when nothing matched)
// to a transition id used with _czech_tokenizer_trans_targs/_trans_actions.
static const char _czech_tokenizer_indicies[] = {
  1, 1, 2, 2, 2, 2, 2, 3,
  2, 3, 1, 2, 2, 1, 3, 0,
  2, 2, 2, 2, 2, 3, 2, 3,
  2, 2, 3, 0, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 4, 5,
  4, 0, 6, 6, 0, 7, 7, 8,
  8, 0, 8, 8, 0, 10, 11, 12,
  10, 13, 9, 13, 9, 13, 16, 16,
  16, 16, 10, 16, 15, 13, 9, 17,
  9, 17, 9, 15, 9, 16, 9, 16,
  9, 14, 10, 19, 20, 10, 10, 18,
  10, 21, 10, 10, 18, 10, 10, 10,
  18, 10, 21, 10, 10, 18, 10, 22,
  23, 10, 10, 18, 25, 24, 10, 22,
  26, 10, 10, 18, 25, 24, 10, 23,
  26, 10, 10, 18, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 4, 5,
  4, 27, 28, 28, 29, 29, 15, 15,
  27, 29, 29, 6, 6, 27, 8, 8,
  27, 16, 16, 16, 16, 16, 16, 16,
  16, 16, 27, 15, 15, 27, 0
};

// Per transition id: the target state.
static const char _czech_tokenizer_trans_targs[] = {
  7, 1, 2, 7, 1, 3, 19, 6,
  20, 7, 8, 12, 16, 17, 0, 18,
  21, 22, 7, 9, 11, 10, 13, 14,
  7, 7, 15, 7, 4, 5
};

// Per transition id: the action case executed in next_sentence (0 = none).
static const char _czech_tokenizer_trans_actions[] = {
  1, 0, 0, 2, 3, 0, 4, 0,
  0, 7, 0, 0, 0, 4, 0, 4,
  0, 0, 8, 0, 0, 0, 0, 0,
  9, 10, 0, 11, 0, 0
};

// Per state: action case executed after a transition into the state
// (next_sentence handles 5, which resets ts).
static const char _czech_tokenizer_to_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 5,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};

// Per state: action case executed before a character is consumed in the
// state (next_sentence handles 6, which records ts).
static const char _czech_tokenizer_from_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 6,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};

// Per state: 1-based transition id taken when the input ends in the state
// (0 = no end-of-input transition).
static const unsigned char _czech_tokenizer_eof_trans[] = {
  0, 1, 1, 1, 1, 1, 1, 0,
  19, 19, 19, 19, 19, 25, 19, 25,
  19, 28, 28, 28, 28, 28, 28
};

// Initial state of the machine.
static const int czech_tokenizer_start = 7;
11284
|
|
|
|
|
|
|
|
11285
|
|
|
|
|
|
|
// The list of lowercased words that, when preceding an end-of-sentence
// character, do not end the sentence (Czech variant).
// Note: because of VS, we cannot list the abbreviations directly in UTF-8,
// because the compilation of UTF-8 encoded sources fails on some locales
// (e.g., Japanese). Non-ASCII characters are therefore written as octal
// escapes of their UTF-8 bytes.
// Convert UTF-8 text to octal escapes:
// perl -CS -ple 'use Encode;s/([^[:ascii:]])/join("", map {sprintf "\\%o", ord($_)} split(m@@, encode("utf-8", $1)))/ge'
// Convert pairs of octal escapes back to UTF-8 text:
// perl -CS -ple 'use Encode;s/\\([0-7]{3})\\([0-7]{3})/decode("utf-8", chr(oct($1)).chr(oct($2)))/ge'
// A few entries ("p", "sv") are listed twice; duplicates are harmless in a set.
// NOTE(review): the element type after `unordered_set` seems to have been
// lost when this listing was extracted (presumably unordered_set<string>) -
// confirm against the class declaration.
const unordered_set czech_tokenizer::abbreviations_czech = {
  // Titles
  "prof", "csc", "drsc", "doc", "phd", "ph", "d",
  "judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
  "ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
  "gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
  "p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
  // Geographic names
  "angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "n\304\233m", "nem", "it", "pol", "ma\304\217", "mad", "rus",
  "sev", "v\303\275ch", "vych", "ji\305\276", "jiz", "z\303\241p", "zap",
  // Common abbrevs
  "adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "nap\305\231", "napr",
  "okr", "pop\305\231", "popr", "pozn", "r", "\305\231", "red", "rep", "resp", "srov", "st", "st\305\231", "str",
  "sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
};
11306
|
|
|
|
|
|
|
|
11307
|
210
|
100
|
|
|
|
|
// Slovak counterpart of the abbreviation list: lowercased words that, when
// preceding an end-of-sentence character, do not end the sentence.
// Non-ASCII characters are written as octal escapes of their UTF-8 bytes.
// A few entries ("p", "sv") are listed twice; duplicates are harmless in a set.
// NOTE(review): the element type after `unordered_set` seems to have been
// lost when this listing was extracted (presumably unordered_set<string>) -
// confirm against the class declaration.
const unordered_set czech_tokenizer::abbreviations_slovak = {
  // Titles
  "prof", "csc", "drsc", "doc", "phd", "ph", "d",
  "judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
  "ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
  "gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
  "p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
  // Geographic names
  "angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "nem", "it", "po\304\276", "pol", "ma\304\217", "mad",
  "rus", "sev", "v\303\275ch", "vych", "ju\305\276", "juz", "z\303\241p", "zap",
  // Common abbrevs
  "adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "napr",
  "okr", "popr", "pozn", "r", "red", "rep", "resp", "srov", "st", "str",
  "sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
};
11322
|
|
|
|
|
|
|
|
11323
|
0
|
|
|
|
|
|
// Builds a tokenizer for the given language.
//  - language: selects the abbreviation set (CZECH or SLOVAK).
//  - version:  values <= 1 are passed to ragel_tokenizer as 1, anything
//              larger as 2.
//  - m:        optional morphological dictionary; merge_hyphenated() does
//              nothing when it is null.
czech_tokenizer::czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m)
  : ragel_tokenizer(version <= 1 ? 1 : 2), m(m) {
  // Start from a well-defined value so that an unexpected enumerator cannot
  // leave the pointer indeterminate.
  // NOTE(review): confirm that is_eos() accepts a null abbreviation set.
  abbreviations = nullptr;

  switch (language) {
    case CZECH:
      abbreviations = &abbreviations_czech;
      break;
    case SLOVAK:
      abbreviations = &abbreviations_slovak;
      break;
  }
}
11334
|
|
|
|
|
|
|
|
11335
|
0
|
|
|
|
|
|
// Tries to merge the trailing tokens of `tokens` into one token when they
// form "word P word" or "word P word P word", where P is a single-character
// token of category unicode::P, all pieces are directly adjacent and every
// word piece starts with a character of category unicode::L. The merge only
// happens when the dictionary `m` analyzes the joined text (without the
// guesser) with a non-negative result; the longest accepted candidate wins.
// NOTE(review): the element type of `vector` seems to have been lost when
// this listing was extracted - confirm against the class declaration.
void czech_tokenizer::merge_hyphenated(vector& tokens) {
  using namespace unilib;

  // Nothing can be validated without a dictionary.
  if (!m) return;
  if (tokens.empty()) return;
  // The last token has to start with a letter.
  if (chars[tokens.back().start].cat & ~unicode::L) return;

  unsigned accepted = 0;
  for (unsigned hyphens = 1; hyphens <= 2; hyphens++) {
    // A candidate with 'hyphens' separators needs 2*hyphens + 1 tokens.
    if (tokens.size() < 2*hyphens + 1) break;

    unsigned separator_index = tokens.size() - 2*hyphens;
    const auto& separator = tokens[separator_index];
    const auto& before = tokens[separator_index - 1];
    const auto& after = tokens[separator_index + 1];

    // The separator is exactly one punctuation character ...
    if (separator.length != 1) break;
    if (chars[separator.start].cat & ~unicode::P) break;
    // ... glued to both neighbours ...
    if (separator.start + separator.length != after.start) break;
    if (before.start + before.length != separator.start) break;
    // ... and the piece in front of it starts with a letter.
    if (chars[before.start].cat & ~unicode::L) break;

    // Text from the first piece of this candidate up to the end of the last token.
    const auto text_begin = chars[before.start].str;
    const auto text_end = chars[tokens.back().start + tokens.back().length].str;
    if (m->analyze(string_piece(text_begin, text_end - text_begin), morpho::NO_GUESSER, lemmas) >= 0)
      accepted = hyphens;
  }

  if (!accepted) return;

  // Stretch the first piece over the whole candidate and drop the rest.
  unsigned merged_index = tokens.size() - 2*accepted - 1;
  tokens[merged_index].length = tokens.back().start + tokens.back().length - tokens[merged_index].start;
  tokens.resize(merged_index + 1);
}
11362
|
|
|
|
|
|
|
|
11363
|
0
|
|
|
|
|
|
// Tokenizes the next sentence starting at `current`, appending its tokens to
// `tokens`. Returns true as soon as emergency_sentence_split() succeeds in the
// initial URL/e-mail loop, otherwise true iff at least one token was produced.
//
// The body is a table-driven state machine (it looks like Ragel output) over
// the _czech_tokenizer_* tables; statement order and the goto labels are
// significant, so do not restructure it by hand.
// NOTE(review): the element type of `vector` seems to have been lost when
// this listing was extracted - confirm against the class declaration.
bool czech_tokenizer::next_sentence(vector& tokens) {
  using namespace unilib;

  // cs: current state; act is only initialized and explicitly marked unused below.
  int cs, act;
  // ts / te: start and one-past-the-end index (into chars) of the match in progress.
  size_t ts, te;
  // Index recorded by action 3 and used by action 2 as the end of the
  // characters emitted as single-character tokens.
  size_t whitespace = 0; // Suppress "may be uninitialized" warning

  // Consume leading URLs / e-mails before running the machine.
  while (tokenize_url_email(tokens))
    if (emergency_sentence_split(tokens))
      return true;

  {
  cs = czech_tokenizer_start;
  ts = 0;
  te = 0;
  act = 0;
  }

  {
  int _klen;
  const short *_keys;
  int _trans;
  short _widec;

  // The last element of chars is treated as the end of input.
  if ( ( current) == ( (chars.size() - 1)) )
    goto _test_eof;
  if ( cs == 0 )
    goto _out;
_resume:
  // From-state action: remember where the match starts.
  switch ( _czech_tokenizer_from_state_actions[cs] ) {
  case 6:
  {ts = ( current);}
  break;
  }

  // Condition handling: for characters covered by the state's condition keys,
  // the input code is widened (256+ / 768+ and another +256 when the check on
  // the preceding character holds) so the tables can tell the contexts apart.
  _widec = ( ragel_char(chars[current]));
  _klen = _czech_tokenizer_cond_lengths[cs];
  _keys = _czech_tokenizer_cond_keys + (_czech_tokenizer_cond_offsets[cs]*2);
  if ( _klen > 0 ) {
    const short *_lower = _keys;
    const short *_mid;
    const short *_upper = _keys + (_klen<<1) - 2;
    // Binary search over [low, high] key pairs.
    while (1) {
      if ( _upper < _lower )
        break;

      _mid = _lower + (((_upper-_lower) >> 1) & ~1);
      if ( _widec < _mid[0] )
        _upper = _mid - 2;
      else if ( _widec > _mid[1] )
        _lower = _mid + 2;
      else {
        switch ( _czech_tokenizer_cond_spaces[_czech_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
  case 0: {
    _widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
    // True at the start of input or when the previous character is not a
    // letter, mark, number or dash punctuation.
    if (
 !current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256;
    break;
  }
  case 1: {
    _widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
    // True at the start of input or when the previous character is neither a
    // letter, mark or number nor '+'.
    if (
 !current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256;
    break;
  }
        }
        break;
      }
    }
  }

  // Transition lookup: exact keys first, then ranges, else the default slot.
  _keys = _czech_tokenizer_trans_keys + _czech_tokenizer_key_offsets[cs];
  _trans = _czech_tokenizer_index_offsets[cs];

  _klen = _czech_tokenizer_single_lengths[cs];
  if ( _klen > 0 ) {
    const short *_lower = _keys;
    const short *_mid;
    const short *_upper = _keys + _klen - 1;
    while (1) {
      if ( _upper < _lower )
        break;

      _mid = _lower + ((_upper-_lower) >> 1);
      if ( _widec < *_mid )
        _upper = _mid - 1;
      else if ( _widec > *_mid )
        _lower = _mid + 1;
      else {
        _trans += (unsigned int)(_mid - _keys);
        goto _match;
      }
    }
    _keys += _klen;
    _trans += _klen;
  }

  _klen = _czech_tokenizer_range_lengths[cs];
  if ( _klen > 0 ) {
    const short *_lower = _keys;
    const short *_mid;
    const short *_upper = _keys + (_klen<<1) - 2;
    while (1) {
      if ( _upper < _lower )
        break;

      _mid = _lower + (((_upper-_lower) >> 1) & ~1);
      if ( _widec < _mid[0] )
        _upper = _mid - 2;
      else if ( _widec > _mid[1] )
        _lower = _mid + 2;
      else {
        _trans += (unsigned int)((_mid - _keys)>>1);
        goto _match;
      }
    }
    _trans += _klen;
  }

_match:
  _trans = _czech_tokenizer_indicies[_trans];
_eof_trans:
  cs = _czech_tokenizer_trans_targs[_trans];

  if ( _czech_tokenizer_trans_actions[_trans] == 0 )
    goto _again;

  // Transition actions. Most token-producing cases share one tail: continue
  // after the match, then repeatedly try tokenize_url_email(), leaving the
  // machine whenever emergency_sentence_split() reports a split.
  switch ( _czech_tokenizer_trans_actions[_trans] ) {
  case 3:
  { whitespace = current; }
  break;
  case 4:
  {te = ( current)+1;}
  break;
  case 7:
  // Emit [ts, current] as a token and try to merge hyphenated words.
  {te = ( current)+1;{ tokens.emplace_back(ts, te - ts);
      merge_hyphenated(tokens);
      current = te;
      do
        if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
      while (tokenize_url_email(tokens));
      ( current)--;
    }}
  break;
  case 2:
  // Emit every character in [ts, whitespace) as its own token; stop the
  // machine when is_eos() classified the position as a sentence end.
  {te = ( current)+1;{
      bool eos = is_eos(tokens, chars[ts].chr, abbreviations);
      for (current = ts; current < whitespace; current++)
        tokens.emplace_back(current, 1);
      {( current) = (( whitespace))-1;}
      if (eos) {( current)++; goto _out; }
    }}
  break;
  case 10:
  // Leaves the machine when tokens were already collected; otherwise the
  // match is skipped without emitting a token.
  {te = ( current)+1;{
      if (!tokens.empty()) {( current)++; goto _out; }
      current = te;
      do
        if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
      while (tokenize_url_email(tokens));
      ( current)--;
    }}
  break;
  case 11:
  // Like case 7, but the match ends before the current character.
  {te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts);
      merge_hyphenated(tokens);
      current = te;
      do
        if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
      while (tokenize_url_email(tokens));
      ( current)--;
    }}
  break;
  case 8:
  // Skip the match (ending before the current character) without a token.
  {te = ( current);( current)--;{
      current = te;
      do
        if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
      while (tokenize_url_email(tokens));
      ( current)--;
    }}
  break;
  case 9:
  // Like case 10, but the match ends before the current character.
  {te = ( current);( current)--;{
      if (!tokens.empty()) {( current)++; goto _out; }
      current = te;
      do
        if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
      while (tokenize_url_email(tokens));
      ( current)--;
    }}
  break;
  case 1:
  // Go back to the previously recorded te and emit [ts, te) as a token.
  {{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts);
      merge_hyphenated(tokens);
      current = te;
      do
        if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } }
      while (tokenize_url_email(tokens));
      ( current)--;
    }}
  break;
  }

_again:
  // To-state action: reset the match start.
  switch ( _czech_tokenizer_to_state_actions[cs] ) {
  case 5:
  {ts = 0;}
  break;
  }

  if ( cs == 0 )
    goto _out;
  if ( ++( current) != ( (chars.size() - 1)) )
    goto _resume;
  _test_eof: {}
  // At end of input take the state's EOF transition, if it has one
  // (entries are stored 1-based, 0 meaning "none").
  if ( ( current) == ( (chars.size() - 1)) )
  {
  if ( _czech_tokenizer_eof_trans[cs] > 0 ) {
    _trans = _czech_tokenizer_eof_trans[cs] - 1;
    goto _eof_trans;
  }
  }

  _out: {}
  }

  (void)act; // Suppress unused variable warning

  return !tokens.empty();
}
11594
|
|
|
|
|
|
|
|
11595
|
|
|
|
|
|
|
} // namespace morphodita |
11596
|
|
|
|
|
|
|
|
11597
|
|
|
|
|
|
|
///////// |
11598
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory.h |
11599
|
|
|
|
|
|
|
///////// |
11600
|
|
|
|
|
|
|
|
11601
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
11602
|
|
|
|
|
|
|
// |
11603
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
11604
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11605
|
|
|
|
|
|
|
// |
11606
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11607
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11608
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11609
|
|
|
|
|
|
|
|
11610
|
|
|
|
|
|
|
namespace morphodita { |
11611
|
|
|
|
|
|
|
|
11612
|
0
|
|
|
|
|
|
// Tokenizer factory producing czech_tokenizer instances. The language and
// version it hands to the tokenizer are read from a stream by load().
class czech_tokenizer_factory : public tokenizer_factory {
 public:
  // Construct a new tokenizer instance.
  virtual tokenizer* new_tokenizer(const morpho* m) const override;

  // Reads language and version from `is`; returns whether the stream was
  // readable and the language is CZECH or SLOVAK.
  bool load(istream& is);
 private:
  // Both fields have no initializer here; they are assigned by load().
  czech_tokenizer::tokenizer_language language;
  unsigned version;
};
11622
|
|
|
|
|
|
|
|
11623
|
|
|
|
|
|
|
} // namespace morphodita |
11624
|
|
|
|
|
|
|
|
11625
|
|
|
|
|
|
|
///////// |
11626
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory.cpp |
11627
|
|
|
|
|
|
|
///////// |
11628
|
|
|
|
|
|
|
|
11629
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
11630
|
|
|
|
|
|
|
// |
11631
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
11632
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11633
|
|
|
|
|
|
|
// |
11634
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11635
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11636
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11637
|
|
|
|
|
|
|
|
11638
|
|
|
|
|
|
|
namespace morphodita { |
11639
|
|
|
|
|
|
|
|
11640
|
0
|
|
|
|
|
|
// Creates a czech_tokenizer configured with this factory's language and
// version and the given morphological dictionary `m`. The object is allocated
// with `new`; presumably the caller is responsible for deleting it (confirm
// against the tokenizer_factory contract).
tokenizer* czech_tokenizer_factory::new_tokenizer(const morpho* m) const {
  tokenizer* created = new czech_tokenizer(language, version, m);
  return created;
}
11643
|
|
|
|
|
|
|
|
11644
|
0
|
|
|
|
|
|
// Reads the two-byte factory description (language id, then version) from
// `is`. Returns true only when both bytes could be read and the language id
// is CZECH or SLOVAK; on failure the factory's fields are left untouched.
// Both bytes are always consumed, as before.
bool czech_tokenizer_factory::load(istream& is) {
  // Keep the raw ints until they are validated: get() returns EOF (a negative
  // value) on failure, and converting an out-of-range value to an enum
  // without a fixed underlying type is undefined behaviour.
  const int raw_language = is.get();
  const int raw_version = is.get();
  if (!is) return false;
  if (raw_language != czech_tokenizer::CZECH && raw_language != czech_tokenizer::SLOVAK) return false;

  language = czech_tokenizer::tokenizer_language(raw_language);
  version = static_cast<unsigned>(raw_version);
  return true;
}
11650
|
|
|
|
|
|
|
|
11651
|
|
|
|
|
|
|
} // namespace morphodita |
11652
|
|
|
|
|
|
|
|
11653
|
|
|
|
|
|
|
///////// |
11654
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory_encoder.h |
11655
|
|
|
|
|
|
|
///////// |
11656
|
|
|
|
|
|
|
|
11657
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
11658
|
|
|
|
|
|
|
// |
11659
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
11660
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11661
|
|
|
|
|
|
|
// |
11662
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11663
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11664
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11665
|
|
|
|
|
|
|
|
11666
|
|
|
|
|
|
|
namespace morphodita { |
11667
|
|
|
|
|
|
|
|
11668
|
|
|
|
|
|
|
// Writes the serialized form of a czech_tokenizer_factory configuration.
class czech_tokenizer_factory_encoder {
 public:
  // Writes `language` and then `version` to `os`, one byte each - the same
  // order in which czech_tokenizer_factory::load() reads them back.
  static void encode(czech_tokenizer::tokenizer_language language, unsigned version, ostream& os);
};
11672
|
|
|
|
|
|
|
|
11673
|
|
|
|
|
|
|
} // namespace morphodita |
11674
|
|
|
|
|
|
|
|
11675
|
|
|
|
|
|
|
///////// |
11676
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory_encoder.cpp |
11677
|
|
|
|
|
|
|
///////// |
11678
|
|
|
|
|
|
|
|
11679
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
11680
|
|
|
|
|
|
|
// |
11681
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
11682
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11683
|
|
|
|
|
|
|
// |
11684
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11685
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11686
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11687
|
|
|
|
|
|
|
|
11688
|
|
|
|
|
|
|
namespace morphodita { |
11689
|
|
|
|
|
|
|
|
11690
|
0
|
|
|
|
|
|
// Serializes the factory configuration as two bytes: the language id followed
// by the version. Each value is converted to a single char, so only its low
// byte is written.
void czech_tokenizer_factory_encoder::encode(czech_tokenizer::tokenizer_language language, unsigned version, ostream& os) {
  const char language_byte = static_cast<char>(language);
  const char version_byte = static_cast<char>(version);

  os.put(language_byte);
  os.put(version_byte);
}
11694
|
|
|
|
|
|
|
|
11695
|
|
|
|
|
|
|
} // namespace morphodita |
11696
|
|
|
|
|
|
|
|
11697
|
|
|
|
|
|
|
///////// |
11698
|
|
|
|
|
|
|
// File: morphodita/tokenizer/english_tokenizer.cpp |
11699
|
|
|
|
|
|
|
///////// |
11700
|
|
|
|
|
|
|
|
11701
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
11702
|
|
|
|
|
|
|
// |
11703
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
11704
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
11705
|
|
|
|
|
|
|
// |
11706
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
11707
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
11708
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
11709
|
|
|
|
|
|
|
|
11710
|
|
|
|
|
|
|
namespace morphodita { |
11711
|
|
|
|
|
|
|
|
11712
|
|
|
|
|
|
|
// The list of lowercased words that, when preceding an end-of-sentence
// character, do not end the sentence.
// Several entries (e.g. "dr", "mr", "mrs", "ms", "gen", "rep") are listed
// more than once; duplicates are harmless in a set.
// NOTE(review): the element type after `unordered_set` seems to have been
// lost when this listing was extracted (presumably unordered_set<string>) -
// confirm against the class declaration.
const unordered_set english_tokenizer::abbreviations = {
  // Titles
  "adj", "adm", "adv", "assoc", "asst", "bart", "bldg", "brig", "bros", "capt",
  "cmdr", "col", "comdr", "con", "corp", "cpl", "d", "dr", "dr", "drs", "ens",
  "gen", "gov", "hon", "hosp", "hr", "insp", "lt", "mm", "mr", "mrs", "ms",
  "maj", "messrs", "mlle", "mme", "mr", "mrs", "ms", "msgr", "op", "ord",
  "pfc", "ph", "phd", "prof", "pvt", "rep", "reps", "res", "rev", "rt", "sen",
  "sens", "sfc", "sgt", "sr", "st", "supt", "surg", "univ",
  // Common abbrevs
  "addr", "approx", "apr", "aug", "calif", "co", "corp", "dec", "def", "e",
  "e.g", "eg", "feb", "fla", "ft", "gen", "gov", "hrs", "i.", "i.e", "ie",
  "inc", "jan", "jr", "ltd", "mar", "max", "min", "mph", "mt", "n", "nov",
  "oct", "ont", "pa", "pres", "rep", "rev", "s", "sec", "sen", "sep", "sept",
  "sgt", "sr", "tel", "un", "univ", "v", "va", "vs", "w", "yrs",
};
11728
|
|
|
|
|
|
|
|
11729
|
|
|
|
|
|
|
// Transition tables of a table-driven state machine; english_tokenizer::
// split_token below starts it in english_tokenizer_split_token_start. They
// look like Ragel output - treat them as generated data and do not edit by
// hand. NOTE(review): the per-table roles below are inferred from the table
// names and from the layout of the analogous Czech tokenizer tables; confirm
// against the body of split_token.

// Presumably per state: index of the state's first key in the trans_keys table.
static const char _english_tokenizer_split_token_key_offsets[] = {
  0, 0, 16, 20, 22, 26, 28, 30,
  32, 34, 36, 44, 46, 50, 52, 54,
  56, 58, 60, 62, 64, 66, 68, 72,
  74, 76, 78, 80, 82, 82
};

// Transition keys (character codes; e.g. 65..84 / 97..116 are ASCII letters,
// 39 is the apostrophe).
static const unsigned char _english_tokenizer_split_token_trans_keys[] = {
  65u, 68u, 69u, 76u, 77u, 78u, 83u, 84u,
  97u, 100u, 101u, 108u, 109u, 110u, 115u, 116u,
  78u, 84u, 110u, 116u, 78u, 110u, 65u, 79u,
  97u, 111u, 87u, 119u, 71u, 103u, 84u, 116u,
  79u, 111u, 39u, 161u, 77u, 82u, 86u, 89u,
  109u, 114u, 118u, 121u, 77u, 109u, 69u, 73u,
  101u, 105u, 76u, 108u, 39u, 161u, 68u, 100u,
  76u, 108u, 39u, 161u, 69u, 101u, 82u, 114u,
  79u, 111u, 77u, 109u, 39u, 79u, 111u, 161u,
  78u, 110u, 78u, 110u, 78u, 110u, 65u, 97u,
  67u, 99u, 0
};

// Presumably per state: number of single (exact-match) keys.
static const char _english_tokenizer_split_token_single_lengths[] = {
  0, 16, 4, 2, 4, 2, 2, 2,
  2, 2, 8, 2, 4, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 4, 2,
  2, 2, 2, 2, 0, 0
};

// Presumably per state: number of range keys (all zero for this machine).
static const char _english_tokenizer_split_token_range_lengths[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0
};

// Presumably per state: base index into the indicies table.
static const unsigned char _english_tokenizer_split_token_index_offsets[] = {
  0, 0, 17, 22, 25, 30, 33, 36,
  39, 42, 45, 54, 57, 62, 65, 68,
  71, 74, 77, 80, 83, 86, 89, 94,
  97, 100, 103, 106, 109, 110
};

// Presumably maps a key slot to a transition id.
static const char _english_tokenizer_split_token_indicies[] = {
  0, 2, 3, 4, 2, 5, 2, 6,
  0, 2, 3, 4, 2, 5, 2, 6,
  1, 7, 8, 7, 8, 1, 9, 9,
  1, 10, 11, 10, 11, 1, 12, 12,
  1, 12, 12, 1, 13, 13, 1, 11,
  11, 1, 14, 14, 1, 15, 2, 2,
  16, 15, 2, 2, 16, 1, 17, 17,
  1, 18, 11, 18, 11, 1, 12, 12,
  1, 19, 19, 1, 12, 12, 1, 2,
  2, 1, 20, 20, 1, 21, 21, 1,
  22, 22, 1, 23, 23, 1, 12, 12,
  1, 24, 25, 25, 24, 1, 14, 14,
  1, 26, 26, 1, 27, 27, 1, 28,
  28, 1, 12, 12, 1, 1, 1, 0
};

// Presumably per transition id: the target state.
static const char _english_tokenizer_split_token_trans_targs[] = {
  2, 0, 9, 10, 16, 17, 22, 3,
  7, 4, 5, 6, 28, 8, 29, 11,
  14, 12, 13, 15, 18, 19, 20, 21,
  23, 24, 25, 26, 27
};

// Presumably per transition id: the action id to execute (0 = none).
static const char _english_tokenizer_split_token_trans_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 1,
  1, 0, 0, 0, 0, 0, 2, 1,
  1, 0, 0, 0, 1, 0, 0, 0,
  0, 0, 1, 0, 0
};

// Presumably per state: action id executed when the input ends in the state
// (0 = none).
static const char _english_tokenizer_split_token_eof_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 3, 0
};

// Initial state of the machine.
static const int english_tokenizer_split_token_start = 1;
11810
|
|
|
|
|
|
|
|
11811
|
0
|
|
|
|
|
|
void english_tokenizer::split_token(vector& tokens) { |
11812
|
0
|
0
|
|
|
|
|
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
11813
|
|
|
|
|
|
|
|
11814
|
0
|
|
|
|
|
|
size_t index = tokens.back().start, end = index + tokens.back().length; |
11815
|
|
|
|
|
|
|
int cs; |
11816
|
0
|
|
|
|
|
|
size_t split_mark = 0, split_len = 0; |
11817
|
|
|
|
|
|
|
|
11818
|
|
|
|
|
|
|
{ |
11819
|
|
|
|
|
|
|
cs = english_tokenizer_split_token_start; |
11820
|
|
|
|
|
|
|
} |
11821
|
|
|
|
|
|
|
|
11822
|
|
|
|
|
|
|
{ |
11823
|
|
|
|
|
|
|
int _klen; |
11824
|
|
|
|
|
|
|
const unsigned char *_keys; |
11825
|
|
|
|
|
|
|
int _trans; |
11826
|
|
|
|
|
|
|
|
11827
|
0
|
0
|
|
|
|
|
if ( ( index) == ( end) ) |
11828
|
|
|
|
|
|
|
goto _test_eof; |
11829
|
|
|
|
|
|
|
if ( cs == 0 ) |
11830
|
|
|
|
|
|
|
goto _out; |
11831
|
|
|
|
|
|
|
_resume: |
11832
|
0
|
|
|
|
|
|
_keys = _english_tokenizer_split_token_trans_keys + _english_tokenizer_split_token_key_offsets[cs]; |
11833
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_split_token_index_offsets[cs]; |
11834
|
|
|
|
|
|
|
|
11835
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_split_token_single_lengths[cs]; |
11836
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
11837
|
|
|
|
|
|
|
const unsigned char *_lower = _keys; |
11838
|
|
|
|
|
|
|
const unsigned char *_mid; |
11839
|
0
|
|
|
|
|
|
const unsigned char *_upper = _keys + _klen - 1; |
11840
|
|
|
|
|
|
|
while (1) { |
11841
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
11842
|
|
|
|
|
|
|
break; |
11843
|
|
|
|
|
|
|
|
11844
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
11845
|
0
|
0
|
|
|
|
|
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid ) |
11846
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
11847
|
0
|
0
|
|
|
|
|
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid ) |
11848
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
11849
|
|
|
|
|
|
|
else { |
11850
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
11851
|
0
|
|
|
|
|
|
goto _match; |
11852
|
|
|
|
|
|
|
} |
11853
|
|
|
|
|
|
|
} |
11854
|
0
|
|
|
|
|
|
_keys += _klen; |
11855
|
0
|
|
|
|
|
|
_trans += _klen; |
11856
|
|
|
|
|
|
|
} |
11857
|
|
|
|
|
|
|
|
11858
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_split_token_range_lengths[cs]; |
11859
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
11860
|
|
|
|
|
|
|
const unsigned char *_lower = _keys; |
11861
|
|
|
|
|
|
|
const unsigned char *_mid; |
11862
|
0
|
|
|
|
|
|
const unsigned char *_upper = _keys + (_klen<<1) - 2; |
11863
|
|
|
|
|
|
|
while (1) { |
11864
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
11865
|
|
|
|
|
|
|
break; |
11866
|
|
|
|
|
|
|
|
11867
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
11868
|
0
|
0
|
|
|
|
|
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] ) |
11869
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
11870
|
0
|
0
|
|
|
|
|
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] ) |
11871
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
11872
|
|
|
|
|
|
|
else { |
11873
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
11874
|
0
|
|
|
|
|
|
goto _match; |
11875
|
|
|
|
|
|
|
} |
11876
|
|
|
|
|
|
|
} |
11877
|
0
|
|
|
|
|
|
_trans += _klen; |
11878
|
|
|
|
|
|
|
} |
11879
|
|
|
|
|
|
|
|
11880
|
|
|
|
|
|
|
_match: |
11881
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_split_token_indicies[_trans]; |
11882
|
0
|
|
|
|
|
|
cs = _english_tokenizer_split_token_trans_targs[_trans]; |
11883
|
|
|
|
|
|
|
|
11884
|
0
|
0
|
|
|
|
|
if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 ) |
11885
|
|
|
|
|
|
|
goto _again; |
11886
|
|
|
|
|
|
|
|
11887
|
0
|
|
|
|
|
|
switch ( _english_tokenizer_split_token_trans_actions[_trans] ) { |
11888
|
|
|
|
|
|
|
case 1: |
11889
|
0
|
|
|
|
|
|
{ split_mark = index - tokens.back().start + 1; } |
11890
|
0
|
|
|
|
|
|
break; |
11891
|
|
|
|
|
|
|
case 2: |
11892
|
0
|
|
|
|
|
|
{ split_mark = index - tokens.back().start + 1; } |
11893
|
0
|
|
|
|
|
|
{ split_len = split_mark; {( index)++; goto _out; } } |
11894
|
|
|
|
|
|
|
break; |
11895
|
|
|
|
|
|
|
} |
11896
|
|
|
|
|
|
|
|
11897
|
|
|
|
|
|
|
_again: |
11898
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
11899
|
|
|
|
|
|
|
goto _out; |
11900
|
0
|
0
|
|
|
|
|
if ( ++( index) != ( end) ) |
11901
|
|
|
|
|
|
|
goto _resume; |
11902
|
|
|
|
|
|
|
_test_eof: {} |
11903
|
0
|
0
|
|
|
|
|
if ( ( index) == ( end) ) |
11904
|
|
|
|
|
|
|
{ |
11905
|
0
|
0
|
|
|
|
|
switch ( _english_tokenizer_split_token_eof_actions[cs] ) { |
11906
|
|
|
|
|
|
|
case 3: |
11907
|
0
|
|
|
|
|
|
{ split_len = split_mark; {( index)++; goto _out; } } |
11908
|
|
|
|
|
|
|
break; |
11909
|
|
|
|
|
|
|
} |
11910
|
|
|
|
|
|
|
} |
11911
|
|
|
|
|
|
|
|
11912
|
|
|
|
|
|
|
_out: {} |
11913
|
|
|
|
|
|
|
} |
11914
|
|
|
|
|
|
|
|
11915
|
0
|
0
|
|
|
|
|
if (split_len && split_len < end) { |
11916
|
0
|
|
|
|
|
|
tokens.back().length -= split_len; |
11917
|
0
|
|
|
|
|
|
tokens.emplace_back(end - split_len, split_len); |
11918
|
|
|
|
|
|
|
} |
11919
|
|
|
|
|
|
|
} |
11920
|
|
|
|
|
|
|
|
11921
|
|
|
|
|
|
|
// Transition tables of the `english_tokenizer` state machine (apparently
// Ragel-generated; 29 states numbered 0..28), consumed by
// english_tokenizer::next_sentence():
//   - cond_offsets / cond_lengths / cond_keys / cond_spaces: per-state list of
//     key ranges whose characters are remapped to a widened key depending on a
//     run-time condition (only state 10 has such ranges: '+' and '-');
//   - key_offsets[cs]:   first key of state `cs` in `trans_keys`;
//   - trans_keys:        per state, `single_lengths[cs]` sorted single keys
//                        followed by `range_lengths[cs]` (low, high) pairs;
//   - index_offsets[cs]: first slot of state `cs` in `indicies`; a state owns
//                        single + range + 1 slots (the extra one is used when
//                        no key matches);
//   - indicies[slot]:    transition id for the slot;
//   - trans_targs[id] / trans_actions[id]: target state and action id;
//   - to_state_actions / from_state_actions: action ids run when entering /
//     leaving a state;
//   - eof_trans[cs]:     1 + transition id taken at end of input (0 = none).
// The interleaved coverage-report residue was removed; the values are unchanged.

static const char _english_tokenizer_cond_offsets[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2
};

static const char _english_tokenizer_cond_lengths[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 2, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0
};

static const short _english_tokenizer_cond_keys[] = {
  43u, 43u, 45u, 45u, 0
};

static const char _english_tokenizer_cond_spaces[] = {
  1, 0, 0
};

static const unsigned char _english_tokenizer_key_offsets[] = {
  0, 0, 17, 29, 43, 46, 49, 52,
  55, 60, 63, 98, 103, 107, 110, 114,
  119, 120, 125, 126, 131, 145, 152, 156,
  161, 164, 179, 192, 206
};

// 209 keys followed by a trailing 0 terminator.
static const short _english_tokenizer_trans_keys[] = {
  13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
  133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
  90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
  135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
  39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
  161u, 9u, 10u, 159u, 48u, 57u, 159u, 48u,
  57u, 159u, 48u, 57u, 159u, 48u, 57u, 43u,
  45u, 159u, 48u, 57u, 159u, 48u, 57u, 9u,
  10u, 13u, 32u, 33u, 44u, 46u, 47u, 63u,
  129u, 131u, 135u, 142u, 147u, 157u, 159u, 160u,
  301u, 557u, 811u, 1067u, 0u, 42u, 48u, 57u,
  58u, 64u, 65u, 90u, 91u, 96u, 97u, 122u,
  123u, 255u, 9u, 10u, 13u, 32u, 147u, 9u,
  13u, 32u, 147u, 9u, 32u, 147u, 9u, 10u,
  32u, 147u, 9u, 10u, 13u, 32u, 147u, 13u,
  9u, 10u, 13u, 32u, 147u, 10u, 9u, 10u,
  13u, 32u, 147u, 13u, 32u, 34u, 39u, 41u,
  59u, 93u, 125u, 139u, 141u, 147u, 161u, 9u,
  10u, 44u, 46u, 69u, 101u, 159u, 48u, 57u,
  44u, 46u, 69u, 101u, 69u, 101u, 159u, 48u,
  57u, 159u, 48u, 57u, 39u, 45u, 129u, 131u,
  135u, 151u, 155u, 157u, 161u, 65u, 90u, 97u,
  122u, 142u, 143u, 45u, 129u, 131u, 135u, 151u,
  155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u,
  39u, 129u, 131u, 135u, 151u, 155u, 157u, 161u,
  65u, 90u, 97u, 122u, 142u, 143u, 159u, 48u,
  57u, 0
};

static const char _english_tokenizer_single_lengths[] = {
  0, 13, 10, 12, 1, 1, 1, 1,
  3, 1, 21, 5, 4, 3, 4, 5,
  1, 5, 1, 5, 12, 5, 4, 3,
  1, 9, 7, 8, 1
};

static const char _english_tokenizer_range_lengths[] = {
  0, 2, 1, 1, 1, 1, 1, 1,
  1, 1, 7, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 1, 1, 0, 1,
  1, 3, 3, 3, 1
};

static const unsigned char _english_tokenizer_index_offsets[] = {
  0, 0, 16, 28, 42, 45, 48, 51,
  54, 59, 62, 91, 97, 102, 106, 111,
  117, 119, 125, 127, 133, 147, 154, 159,
  164, 167, 180, 191, 203
};

// 206 transition slots followed by a trailing 0 terminator.
static const char _english_tokenizer_indicies[] = {
  1, 1, 2, 2, 2, 2, 2, 3,
  2, 3, 1, 2, 2, 1, 3, 0,
  2, 2, 2, 2, 2, 3, 2, 3,
  2, 2, 3, 0, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 4, 5,
  4, 0, 6, 6, 0, 7, 7, 0,
  8, 8, 0, 9, 9, 0, 10, 10,
  11, 11, 0, 11, 11, 0, 13, 14,
  15, 13, 16, 12, 16, 12, 16, 19,
  19, 19, 19, 13, 19, 18, 16, 12,
  20, 12, 20, 12, 18, 12, 19, 12,
  19, 12, 17, 13, 22, 23, 13, 13,
  21, 13, 24, 13, 13, 21, 13, 13,
  13, 21, 13, 24, 13, 13, 21, 13,
  25, 26, 13, 13, 21, 28, 27, 13,
  25, 29, 13, 13, 21, 28, 27, 13,
  26, 29, 13, 13, 21, 4, 4, 5,
  5, 5, 5, 5, 5, 5, 5, 4,
  5, 4, 30, 31, 32, 33, 33, 18,
  18, 30, 31, 32, 33, 33, 30, 33,
  33, 9, 9, 30, 11, 11, 30, 34,
  35, 19, 19, 19, 19, 19, 19, 34,
  19, 19, 19, 30, 35, 19, 19, 19,
  19, 19, 19, 19, 19, 19, 30, 34,
  19, 19, 19, 19, 19, 19, 34, 19,
  19, 19, 30, 18, 18, 30, 0
};

static const char _english_tokenizer_trans_targs[] = {
  10, 1, 2, 10, 1, 3, 5, 6,
  22, 23, 9, 24, 10, 11, 15, 19,
  20, 0, 21, 25, 28, 10, 12, 14,
  13, 16, 17, 10, 10, 18, 10, 4,
  7, 8, 26, 27
};

static const char _english_tokenizer_trans_actions[] = {
  1, 0, 0, 2, 3, 0, 0, 0,
  4, 4, 0, 0, 7, 0, 0, 0,
  4, 0, 4, 0, 0, 8, 0, 0,
  0, 0, 0, 9, 10, 0, 11, 0,
  0, 0, 0, 0
};

static const char _english_tokenizer_to_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 5, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0
};

static const char _english_tokenizer_from_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 6, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0
};

static const unsigned char _english_tokenizer_eof_trans[] = {
  0, 1, 1, 1, 1, 1, 1, 1,
  1, 1, 0, 22, 22, 22, 22, 22,
  28, 22, 28, 22, 31, 31, 31, 31,
  31, 31, 31, 31, 31
};

// State in which next_sentence() starts the machine.
static const int english_tokenizer_start = 10;
12068
|
|
|
|
|
|
|
|
12069
|
0
|
0
|
|
|
|
|
// Constructs the English tokenizer. The requested version is normalized before
// it is handed to the ragel_tokenizer base: 0 and 1 become 1, anything larger
// becomes 2.
english_tokenizer::english_tokenizer(unsigned version)
  : ragel_tokenizer(version > 1 ? 2 : 1) {
}
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12070
|
|
|
|
|
|
|
|
12071
|
0
|
|
|
|
|
|
bool english_tokenizer::next_sentence(vector& tokens) { |
12072
|
|
|
|
|
|
|
using namespace unilib; |
12073
|
|
|
|
|
|
|
|
12074
|
|
|
|
|
|
|
int cs, act; |
12075
|
|
|
|
|
|
|
size_t ts, te; |
12076
|
|
|
|
|
|
|
size_t whitespace = 0; // Suppress "may be uninitialized" warning |
12077
|
|
|
|
|
|
|
|
12078
|
0
|
0
|
|
|
|
|
while (tokenize_url_email(tokens)) |
12079
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) |
12080
|
|
|
|
|
|
|
return true; |
12081
|
|
|
|
|
|
|
|
12082
|
|
|
|
|
|
|
{ |
12083
|
|
|
|
|
|
|
cs = english_tokenizer_start; |
12084
|
0
|
|
|
|
|
|
ts = 0; |
12085
|
|
|
|
|
|
|
te = 0; |
12086
|
|
|
|
|
|
|
act = 0; |
12087
|
|
|
|
|
|
|
} |
12088
|
|
|
|
|
|
|
|
12089
|
|
|
|
|
|
|
{ |
12090
|
|
|
|
|
|
|
int _klen; |
12091
|
|
|
|
|
|
|
const short *_keys; |
12092
|
|
|
|
|
|
|
int _trans; |
12093
|
|
|
|
|
|
|
short _widec; |
12094
|
|
|
|
|
|
|
|
12095
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
12096
|
|
|
|
|
|
|
goto _test_eof; |
12097
|
|
|
|
|
|
|
if ( cs == 0 ) |
12098
|
|
|
|
|
|
|
goto _out; |
12099
|
|
|
|
|
|
|
_resume: |
12100
|
0
|
0
|
|
|
|
|
switch ( _english_tokenizer_from_state_actions[cs] ) { |
12101
|
|
|
|
|
|
|
case 6: |
12102
|
0
|
|
|
|
|
|
{ts = ( current);} |
12103
|
0
|
|
|
|
|
|
break; |
12104
|
|
|
|
|
|
|
} |
12105
|
|
|
|
|
|
|
|
12106
|
0
|
|
|
|
|
|
_widec = ( ragel_char(chars[current])); |
12107
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_cond_lengths[cs]; |
12108
|
0
|
|
|
|
|
|
_keys = _english_tokenizer_cond_keys + (_english_tokenizer_cond_offsets[cs]*2); |
12109
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
12110
|
|
|
|
|
|
|
const short *_lower = _keys; |
12111
|
|
|
|
|
|
|
const short *_mid; |
12112
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
12113
|
|
|
|
|
|
|
while (1) { |
12114
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
12115
|
|
|
|
|
|
|
break; |
12116
|
|
|
|
|
|
|
|
12117
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
12118
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
12119
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
12120
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
12121
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
12122
|
|
|
|
|
|
|
else { |
12123
|
0
|
|
|
|
|
|
switch ( _english_tokenizer_cond_spaces[_english_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) { |
12124
|
|
|
|
|
|
|
case 0: { |
12125
|
0
|
|
|
|
|
|
_widec = (short)(256u + (( ragel_char(chars[current])) - 0u)); |
12126
|
0
|
0
|
|
|
|
|
if ( |
12127
|
0
|
0
|
|
|
|
|
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
0
|
|
|
|
|
|
12128
|
|
|
|
|
|
|
break; |
12129
|
|
|
|
|
|
|
} |
12130
|
|
|
|
|
|
|
case 1: { |
12131
|
0
|
|
|
|
|
|
_widec = (short)(768u + (( ragel_char(chars[current])) - 0u)); |
12132
|
0
|
0
|
|
|
|
|
if ( |
12133
|
0
|
0
|
|
|
|
|
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12134
|
|
|
|
|
|
|
break; |
12135
|
|
|
|
|
|
|
} |
12136
|
|
|
|
|
|
|
} |
12137
|
|
|
|
|
|
|
break; |
12138
|
|
|
|
|
|
|
} |
12139
|
|
|
|
|
|
|
} |
12140
|
|
|
|
|
|
|
} |
12141
|
|
|
|
|
|
|
|
12142
|
0
|
|
|
|
|
|
_keys = _english_tokenizer_trans_keys + _english_tokenizer_key_offsets[cs]; |
12143
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_index_offsets[cs]; |
12144
|
|
|
|
|
|
|
|
12145
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_single_lengths[cs]; |
12146
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
12147
|
|
|
|
|
|
|
const short *_lower = _keys; |
12148
|
|
|
|
|
|
|
const short *_mid; |
12149
|
0
|
|
|
|
|
|
const short *_upper = _keys + _klen - 1; |
12150
|
|
|
|
|
|
|
while (1) { |
12151
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
12152
|
|
|
|
|
|
|
break; |
12153
|
|
|
|
|
|
|
|
12154
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
12155
|
0
|
0
|
|
|
|
|
if ( _widec < *_mid ) |
12156
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
12157
|
0
|
0
|
|
|
|
|
else if ( _widec > *_mid ) |
12158
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
12159
|
|
|
|
|
|
|
else { |
12160
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
12161
|
0
|
|
|
|
|
|
goto _match; |
12162
|
|
|
|
|
|
|
} |
12163
|
|
|
|
|
|
|
} |
12164
|
0
|
|
|
|
|
|
_keys += _klen; |
12165
|
0
|
|
|
|
|
|
_trans += _klen; |
12166
|
|
|
|
|
|
|
} |
12167
|
|
|
|
|
|
|
|
12168
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_range_lengths[cs]; |
12169
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
12170
|
|
|
|
|
|
|
const short *_lower = _keys; |
12171
|
|
|
|
|
|
|
const short *_mid; |
12172
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
12173
|
|
|
|
|
|
|
while (1) { |
12174
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
12175
|
|
|
|
|
|
|
break; |
12176
|
|
|
|
|
|
|
|
12177
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
12178
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
12179
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
12180
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
12181
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
12182
|
|
|
|
|
|
|
else { |
12183
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
12184
|
0
|
|
|
|
|
|
goto _match; |
12185
|
|
|
|
|
|
|
} |
12186
|
|
|
|
|
|
|
} |
12187
|
0
|
|
|
|
|
|
_trans += _klen; |
12188
|
|
|
|
|
|
|
} |
12189
|
|
|
|
|
|
|
|
12190
|
|
|
|
|
|
|
_match: |
12191
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_indicies[_trans]; |
12192
|
|
|
|
|
|
|
_eof_trans: |
12193
|
0
|
|
|
|
|
|
cs = _english_tokenizer_trans_targs[_trans]; |
12194
|
|
|
|
|
|
|
|
12195
|
0
|
0
|
|
|
|
|
if ( _english_tokenizer_trans_actions[_trans] == 0 ) |
12196
|
|
|
|
|
|
|
goto _again; |
12197
|
|
|
|
|
|
|
|
12198
|
0
|
|
|
|
|
|
switch ( _english_tokenizer_trans_actions[_trans] ) { |
12199
|
|
|
|
|
|
|
case 3: |
12200
|
0
|
|
|
|
|
|
{ whitespace = current; } |
12201
|
0
|
|
|
|
|
|
break; |
12202
|
|
|
|
|
|
|
case 4: |
12203
|
0
|
|
|
|
|
|
{te = ( current)+1;} |
12204
|
0
|
|
|
|
|
|
break; |
12205
|
|
|
|
|
|
|
case 7: |
12206
|
0
|
|
|
|
|
|
{te = ( current)+1;{ tokens.emplace_back(ts, te - ts); |
12207
|
0
|
|
|
|
|
|
split_token(tokens); |
12208
|
0
|
|
|
|
|
|
current = te; |
12209
|
0
|
0
|
|
|
|
|
do |
12210
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12211
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12212
|
0
|
|
|
|
|
|
( current)--; |
12213
|
|
|
|
|
|
|
}} |
12214
|
0
|
|
|
|
|
|
break; |
12215
|
|
|
|
|
|
|
case 2: |
12216
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
12217
|
0
|
|
|
|
|
|
bool eos = is_eos(tokens, chars[ts].chr, &abbreviations); |
12218
|
0
|
0
|
|
|
|
|
for (current = ts; current < whitespace; current++) |
12219
|
0
|
|
|
|
|
|
tokens.emplace_back(current, 1); |
12220
|
0
|
|
|
|
|
|
{( current) = (( whitespace))-1;} |
12221
|
0
|
0
|
|
|
|
|
if (eos) {( current)++; goto _out; } |
12222
|
|
|
|
|
|
|
}} |
12223
|
|
|
|
|
|
|
break; |
12224
|
|
|
|
|
|
|
case 10: |
12225
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
12226
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
12227
|
0
|
|
|
|
|
|
current = te; |
12228
|
0
|
0
|
|
|
|
|
do |
12229
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12230
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12231
|
0
|
|
|
|
|
|
( current)--; |
12232
|
|
|
|
|
|
|
}} |
12233
|
0
|
|
|
|
|
|
break; |
12234
|
|
|
|
|
|
|
case 11: |
12235
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts); |
12236
|
0
|
|
|
|
|
|
split_token(tokens); |
12237
|
0
|
|
|
|
|
|
current = te; |
12238
|
0
|
0
|
|
|
|
|
do |
12239
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12240
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12241
|
0
|
|
|
|
|
|
( current)--; |
12242
|
|
|
|
|
|
|
}} |
12243
|
0
|
|
|
|
|
|
break; |
12244
|
|
|
|
|
|
|
case 8: |
12245
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
12246
|
0
|
|
|
|
|
|
current = te; |
12247
|
0
|
0
|
|
|
|
|
do |
12248
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12249
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12250
|
0
|
|
|
|
|
|
( current)--; |
12251
|
|
|
|
|
|
|
}} |
12252
|
0
|
|
|
|
|
|
break; |
12253
|
|
|
|
|
|
|
case 9: |
12254
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
12255
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
12256
|
0
|
|
|
|
|
|
current = te; |
12257
|
0
|
0
|
|
|
|
|
do |
12258
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12259
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12260
|
0
|
|
|
|
|
|
( current)--; |
12261
|
|
|
|
|
|
|
}} |
12262
|
0
|
|
|
|
|
|
break; |
12263
|
|
|
|
|
|
|
case 1: |
12264
|
0
|
|
|
|
|
|
{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts); |
12265
|
0
|
|
|
|
|
|
split_token(tokens); |
12266
|
0
|
|
|
|
|
|
current = te; |
12267
|
0
|
0
|
|
|
|
|
do |
12268
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12269
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12270
|
0
|
|
|
|
|
|
( current)--; |
12271
|
|
|
|
|
|
|
}} |
12272
|
0
|
|
|
|
|
|
break; |
12273
|
|
|
|
|
|
|
} |
12274
|
|
|
|
|
|
|
|
12275
|
|
|
|
|
|
|
_again: |
12276
|
0
|
0
|
|
|
|
|
switch ( _english_tokenizer_to_state_actions[cs] ) { |
12277
|
|
|
|
|
|
|
case 5: |
12278
|
0
|
|
|
|
|
|
{ts = 0;} |
12279
|
0
|
|
|
|
|
|
break; |
12280
|
|
|
|
|
|
|
} |
12281
|
|
|
|
|
|
|
|
12282
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
12283
|
|
|
|
|
|
|
goto _out; |
12284
|
0
|
0
|
|
|
|
|
if ( ++( current) != ( (chars.size() - 1)) ) |
12285
|
|
|
|
|
|
|
goto _resume; |
12286
|
|
|
|
|
|
|
_test_eof: {} |
12287
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
12288
|
|
|
|
|
|
|
{ |
12289
|
0
|
0
|
|
|
|
|
if ( _english_tokenizer_eof_trans[cs] > 0 ) { |
12290
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_eof_trans[cs] - 1; |
12291
|
0
|
|
|
|
|
|
goto _eof_trans; |
12292
|
|
|
|
|
|
|
} |
12293
|
|
|
|
|
|
|
} |
12294
|
|
|
|
|
|
|
|
12295
|
|
|
|
|
|
|
_out: {} |
12296
|
|
|
|
|
|
|
} |
12297
|
|
|
|
|
|
|
|
12298
|
|
|
|
|
|
|
(void)act; // Suppress unused variable warning |
12299
|
|
|
|
|
|
|
|
12300
|
0
|
|
|
|
|
|
return !tokens.empty(); |
12301
|
|
|
|
|
|
|
} |
12302
|
|
|
|
|
|
|
|
12303
|
|
|
|
|
|
|
} // namespace morphodita |
12304
|
|
|
|
|
|
|
|
12305
|
|
|
|
|
|
|
///////// |
12306
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer.cpp |
12307
|
|
|
|
|
|
|
///////// |
12308
|
|
|
|
|
|
|
|
12309
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
12310
|
|
|
|
|
|
|
// |
12311
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
12312
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
12313
|
|
|
|
|
|
|
// |
12314
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
12315
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
12316
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
12317
|
|
|
|
|
|
|
|
12318
|
|
|
|
|
|
|
namespace morphodita { |
12319
|
|
|
|
|
|
|
|
12320
|
|
|
|
|
|
|
// Transition tables of the `generic_tokenizer` state machine (apparently
// Ragel-generated; 23 states numbered 0..22), consumed by
// generic_tokenizer::next_sentence(). The layout mirrors the english_tokenizer
// tables defined earlier in this file:
//   - cond_offsets / cond_lengths / cond_keys / cond_spaces: per-state list of
//     key ranges whose characters are remapped to a widened key depending on a
//     run-time condition (only state 7 has such ranges: '+' and '-');
//   - key_offsets[cs]:   first key of state `cs` in `trans_keys`;
//   - trans_keys:        per state, `single_lengths[cs]` single keys followed
//                        by `range_lengths[cs]` (low, high) pairs;
//   - index_offsets[cs]: first slot of state `cs` in `indicies` (a state owns
//                        single + range + 1 slots);
//   - indicies, trans_targs, trans_actions, to_state_actions,
//     from_state_actions, eof_trans: presumably used exactly as their
//     english_tokenizer counterparts - confirm in next_sentence().
// The interleaved coverage-report residue was removed; the values are unchanged.

static const char _generic_tokenizer_cond_offsets[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2
};

static const char _generic_tokenizer_cond_lengths[] = {
  0, 0, 0, 0, 0, 0, 0, 2,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};

static const short _generic_tokenizer_cond_keys[] = {
  43u, 43u, 45u, 45u, 0
};

static const char _generic_tokenizer_cond_spaces[] = {
  1, 0, 0
};

static const unsigned char _generic_tokenizer_key_offsets[] = {
  0, 0, 17, 29, 43, 46, 51, 54,
  89, 94, 98, 101, 105, 110, 111, 116,
  117, 122, 136, 142, 147, 150, 162
};

// 165 keys followed by a trailing 0 terminator.
static const short _generic_tokenizer_trans_keys[] = {
  13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
  133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
  90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
  135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
  39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
  161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u,
  159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u,
  13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u,
  131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
  557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
  64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
  255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
  32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
  147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
  10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
  32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
  93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
  46u, 69u, 101u, 159u, 48u, 57u, 69u, 101u,
  159u, 48u, 57u, 159u, 48u, 57u, 129u, 131u,
  135u, 151u, 155u, 157u, 65u, 90u, 97u, 122u,
  142u, 143u, 159u, 48u, 57u, 0
};

static const char _generic_tokenizer_single_lengths[] = {
  0, 13, 10, 12, 1, 3, 1, 21,
  5, 4, 3, 4, 5, 1, 5, 1,
  5, 12, 4, 3, 1, 6, 1
};

static const char _generic_tokenizer_range_lengths[] = {
  0, 2, 1, 1, 1, 1, 1, 7,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 1, 1, 3, 1
};

static const unsigned char _generic_tokenizer_index_offsets[] = {
  0, 0, 16, 28, 42, 45, 50, 53,
  82, 88, 93, 97, 102, 108, 110, 116,
  118, 124, 138, 144, 149, 152, 162
};

// 165 transition slots followed by a trailing 0 terminator.
static const char _generic_tokenizer_indicies[] = {
  1, 1, 2, 2, 2, 2, 2, 3,
  2, 3, 1, 2, 2, 1, 3, 0,
  2, 2, 2, 2, 2, 3, 2, 3,
  2, 2, 3, 0, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 4, 5,
  4, 0, 6, 6, 0, 7, 7, 8,
  8, 0, 8, 8, 0, 10, 11, 12,
  10, 13, 9, 13, 9, 13, 16, 16,
  16, 16, 10, 16, 15, 13, 9, 17,
  9, 17, 9, 15, 9, 16, 9, 16,
  9, 14, 10, 19, 20, 10, 10, 18,
  10, 21, 10, 10, 18, 10, 10, 10,
  18, 10, 21, 10, 10, 18, 10, 22,
  23, 10, 10, 18, 25, 24, 10, 22,
  26, 10, 10, 18, 25, 24, 10, 23,
  26, 10, 10, 18, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 4, 5,
  4, 27, 28, 29, 29, 15, 15, 27,
  29, 29, 6, 6, 27, 8, 8, 27,
  16, 16, 16, 16, 16, 16, 16, 16,
  16, 27, 15, 15, 27, 0
};

static const char _generic_tokenizer_trans_targs[] = {
  7, 1, 2, 7, 1, 3, 19, 6,
  20, 7, 8, 12, 16, 17, 0, 18,
  21, 22, 7, 9, 11, 10, 13, 14,
  7, 7, 15, 7, 4, 5
};

static const char _generic_tokenizer_trans_actions[] = {
  1, 0, 0, 2, 3, 0, 4, 0,
  0, 7, 0, 0, 0, 4, 0, 4,
  0, 0, 8, 0, 0, 0, 0, 0,
  9, 10, 0, 11, 0, 0
};

static const char _generic_tokenizer_to_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 5,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};

static const char _generic_tokenizer_from_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 6,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};

static const unsigned char _generic_tokenizer_eof_trans[] = {
  0, 1, 1, 1, 1, 1, 1, 0,
  19, 19, 19, 19, 19, 25, 19, 25,
  19, 28, 28, 28, 28, 28, 28
};

// State in which next_sentence() starts the machine.
static const int generic_tokenizer_start = 7;
12445
|
|
|
|
|
|
|
|
12446
|
0
|
0
|
|
|
|
|
// Constructs the generic tokenizer. The requested version is normalized before
// it is handed to the ragel_tokenizer base: 0 and 1 become 1, anything larger
// becomes 2.
generic_tokenizer::generic_tokenizer(unsigned version)
  : ragel_tokenizer(version > 1 ? 2 : 1) {
}
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12447
|
|
|
|
|
|
|
|
12448
|
0
|
|
|
|
|
|
bool generic_tokenizer::next_sentence(vector& tokens) { |
12449
|
|
|
|
|
|
|
using namespace unilib; |
12450
|
|
|
|
|
|
|
|
12451
|
|
|
|
|
|
|
int cs, act; |
12452
|
|
|
|
|
|
|
size_t ts, te; |
12453
|
|
|
|
|
|
|
size_t whitespace = 0; // Suppress "may be uninitialized" warning |
12454
|
|
|
|
|
|
|
|
12455
|
0
|
0
|
|
|
|
|
while (tokenize_url_email(tokens)) |
12456
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) |
12457
|
|
|
|
|
|
|
return true; |
12458
|
|
|
|
|
|
|
|
12459
|
|
|
|
|
|
|
{ |
12460
|
|
|
|
|
|
|
cs = generic_tokenizer_start; |
12461
|
0
|
|
|
|
|
|
ts = 0; |
12462
|
|
|
|
|
|
|
te = 0; |
12463
|
|
|
|
|
|
|
act = 0; |
12464
|
|
|
|
|
|
|
} |
12465
|
|
|
|
|
|
|
|
12466
|
|
|
|
|
|
|
{ |
12467
|
|
|
|
|
|
|
int _klen; |
12468
|
|
|
|
|
|
|
const short *_keys; |
12469
|
|
|
|
|
|
|
int _trans; |
12470
|
|
|
|
|
|
|
short _widec; |
12471
|
|
|
|
|
|
|
|
12472
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
12473
|
|
|
|
|
|
|
goto _test_eof; |
12474
|
|
|
|
|
|
|
if ( cs == 0 ) |
12475
|
|
|
|
|
|
|
goto _out; |
12476
|
|
|
|
|
|
|
_resume: |
12477
|
0
|
0
|
|
|
|
|
switch ( _generic_tokenizer_from_state_actions[cs] ) { |
12478
|
|
|
|
|
|
|
case 6: |
12479
|
0
|
|
|
|
|
|
{ts = ( current);} |
12480
|
0
|
|
|
|
|
|
break; |
12481
|
|
|
|
|
|
|
} |
12482
|
|
|
|
|
|
|
|
12483
|
0
|
|
|
|
|
|
_widec = ( ragel_char(chars[current])); |
12484
|
0
|
|
|
|
|
|
_klen = _generic_tokenizer_cond_lengths[cs]; |
12485
|
0
|
|
|
|
|
|
_keys = _generic_tokenizer_cond_keys + (_generic_tokenizer_cond_offsets[cs]*2); |
12486
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
12487
|
|
|
|
|
|
|
const short *_lower = _keys; |
12488
|
|
|
|
|
|
|
const short *_mid; |
12489
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
12490
|
|
|
|
|
|
|
while (1) { |
12491
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
12492
|
|
|
|
|
|
|
break; |
12493
|
|
|
|
|
|
|
|
12494
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
12495
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
12496
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
12497
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
12498
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
12499
|
|
|
|
|
|
|
else { |
12500
|
0
|
|
|
|
|
|
switch ( _generic_tokenizer_cond_spaces[_generic_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) { |
12501
|
|
|
|
|
|
|
case 0: { |
12502
|
0
|
|
|
|
|
|
_widec = (short)(256u + (( ragel_char(chars[current])) - 0u)); |
12503
|
0
|
0
|
|
|
|
|
if ( |
12504
|
0
|
0
|
|
|
|
|
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
0
|
|
|
|
|
|
12505
|
|
|
|
|
|
|
break; |
12506
|
|
|
|
|
|
|
} |
12507
|
|
|
|
|
|
|
case 1: { |
12508
|
0
|
|
|
|
|
|
_widec = (short)(768u + (( ragel_char(chars[current])) - 0u)); |
12509
|
0
|
0
|
|
|
|
|
if ( |
12510
|
0
|
0
|
|
|
|
|
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12511
|
|
|
|
|
|
|
break; |
12512
|
|
|
|
|
|
|
} |
12513
|
|
|
|
|
|
|
} |
12514
|
|
|
|
|
|
|
break; |
12515
|
|
|
|
|
|
|
} |
12516
|
|
|
|
|
|
|
} |
12517
|
|
|
|
|
|
|
} |
12518
|
|
|
|
|
|
|
|
12519
|
0
|
|
|
|
|
|
_keys = _generic_tokenizer_trans_keys + _generic_tokenizer_key_offsets[cs]; |
12520
|
0
|
|
|
|
|
|
_trans = _generic_tokenizer_index_offsets[cs]; |
12521
|
|
|
|
|
|
|
|
12522
|
0
|
|
|
|
|
|
_klen = _generic_tokenizer_single_lengths[cs]; |
12523
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
12524
|
|
|
|
|
|
|
const short *_lower = _keys; |
12525
|
|
|
|
|
|
|
const short *_mid; |
12526
|
0
|
|
|
|
|
|
const short *_upper = _keys + _klen - 1; |
12527
|
|
|
|
|
|
|
while (1) { |
12528
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
12529
|
|
|
|
|
|
|
break; |
12530
|
|
|
|
|
|
|
|
12531
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
12532
|
0
|
0
|
|
|
|
|
if ( _widec < *_mid ) |
12533
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
12534
|
0
|
0
|
|
|
|
|
else if ( _widec > *_mid ) |
12535
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
12536
|
|
|
|
|
|
|
else { |
12537
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
12538
|
0
|
|
|
|
|
|
goto _match; |
12539
|
|
|
|
|
|
|
} |
12540
|
|
|
|
|
|
|
} |
12541
|
0
|
|
|
|
|
|
_keys += _klen; |
12542
|
0
|
|
|
|
|
|
_trans += _klen; |
12543
|
|
|
|
|
|
|
} |
12544
|
|
|
|
|
|
|
|
12545
|
0
|
|
|
|
|
|
_klen = _generic_tokenizer_range_lengths[cs]; |
12546
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
12547
|
|
|
|
|
|
|
const short *_lower = _keys; |
12548
|
|
|
|
|
|
|
const short *_mid; |
12549
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
12550
|
|
|
|
|
|
|
while (1) { |
12551
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
12552
|
|
|
|
|
|
|
break; |
12553
|
|
|
|
|
|
|
|
12554
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
12555
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
12556
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
12557
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
12558
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
12559
|
|
|
|
|
|
|
else { |
12560
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
12561
|
0
|
|
|
|
|
|
goto _match; |
12562
|
|
|
|
|
|
|
} |
12563
|
|
|
|
|
|
|
} |
12564
|
0
|
|
|
|
|
|
_trans += _klen; |
12565
|
|
|
|
|
|
|
} |
12566
|
|
|
|
|
|
|
|
12567
|
|
|
|
|
|
|
_match: |
12568
|
0
|
|
|
|
|
|
_trans = _generic_tokenizer_indicies[_trans]; |
12569
|
|
|
|
|
|
|
_eof_trans: |
12570
|
0
|
|
|
|
|
|
cs = _generic_tokenizer_trans_targs[_trans]; |
12571
|
|
|
|
|
|
|
|
12572
|
0
|
0
|
|
|
|
|
if ( _generic_tokenizer_trans_actions[_trans] == 0 ) |
12573
|
|
|
|
|
|
|
goto _again; |
12574
|
|
|
|
|
|
|
|
12575
|
0
|
|
|
|
|
|
switch ( _generic_tokenizer_trans_actions[_trans] ) { |
12576
|
|
|
|
|
|
|
case 3: |
12577
|
0
|
|
|
|
|
|
{ whitespace = current; } |
12578
|
0
|
|
|
|
|
|
break; |
12579
|
|
|
|
|
|
|
case 4: |
12580
|
0
|
|
|
|
|
|
{te = ( current)+1;} |
12581
|
0
|
|
|
|
|
|
break; |
12582
|
|
|
|
|
|
|
case 7: |
12583
|
0
|
|
|
|
|
|
{te = ( current)+1;{ tokens.emplace_back(ts, te - ts); |
12584
|
0
|
|
|
|
|
|
current = te; |
12585
|
0
|
0
|
|
|
|
|
do |
12586
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12587
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12588
|
0
|
|
|
|
|
|
( current)--; |
12589
|
|
|
|
|
|
|
}} |
12590
|
0
|
|
|
|
|
|
break; |
12591
|
|
|
|
|
|
|
case 2: |
12592
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
12593
|
0
|
|
|
|
|
|
bool eos = is_eos(tokens, chars[ts].chr, nullptr); |
12594
|
0
|
0
|
|
|
|
|
for (current = ts; current < whitespace; current++) |
12595
|
0
|
|
|
|
|
|
tokens.emplace_back(current, 1); |
12596
|
0
|
|
|
|
|
|
{( current) = (( whitespace))-1;} |
12597
|
0
|
0
|
|
|
|
|
if (eos) {( current)++; goto _out; } |
12598
|
|
|
|
|
|
|
}} |
12599
|
|
|
|
|
|
|
break; |
12600
|
|
|
|
|
|
|
case 10: |
12601
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
12602
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
12603
|
0
|
|
|
|
|
|
current = te; |
12604
|
0
|
0
|
|
|
|
|
do |
12605
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12606
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12607
|
0
|
|
|
|
|
|
( current)--; |
12608
|
|
|
|
|
|
|
}} |
12609
|
0
|
|
|
|
|
|
break; |
12610
|
|
|
|
|
|
|
case 11: |
12611
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts); |
12612
|
0
|
|
|
|
|
|
current = te; |
12613
|
0
|
0
|
|
|
|
|
do |
12614
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12615
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12616
|
0
|
|
|
|
|
|
( current)--; |
12617
|
|
|
|
|
|
|
}} |
12618
|
0
|
|
|
|
|
|
break; |
12619
|
|
|
|
|
|
|
case 8: |
12620
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
12621
|
0
|
|
|
|
|
|
current = te; |
12622
|
0
|
0
|
|
|
|
|
do |
12623
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12624
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12625
|
0
|
|
|
|
|
|
( current)--; |
12626
|
|
|
|
|
|
|
}} |
12627
|
0
|
|
|
|
|
|
break; |
12628
|
|
|
|
|
|
|
case 9: |
12629
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
12630
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
12631
|
0
|
|
|
|
|
|
current = te; |
12632
|
0
|
0
|
|
|
|
|
do |
12633
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12634
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12635
|
0
|
|
|
|
|
|
( current)--; |
12636
|
|
|
|
|
|
|
}} |
12637
|
0
|
|
|
|
|
|
break; |
12638
|
|
|
|
|
|
|
case 1: |
12639
|
0
|
|
|
|
|
|
{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts); |
12640
|
0
|
|
|
|
|
|
current = te; |
12641
|
0
|
0
|
|
|
|
|
do |
12642
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12643
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
12644
|
0
|
|
|
|
|
|
( current)--; |
12645
|
|
|
|
|
|
|
}} |
12646
|
0
|
|
|
|
|
|
break; |
12647
|
|
|
|
|
|
|
} |
12648
|
|
|
|
|
|
|
|
12649
|
|
|
|
|
|
|
_again: |
12650
|
0
|
0
|
|
|
|
|
switch ( _generic_tokenizer_to_state_actions[cs] ) { |
12651
|
|
|
|
|
|
|
case 5: |
12652
|
0
|
|
|
|
|
|
{ts = 0;} |
12653
|
0
|
|
|
|
|
|
break; |
12654
|
|
|
|
|
|
|
} |
12655
|
|
|
|
|
|
|
|
12656
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
12657
|
|
|
|
|
|
|
goto _out; |
12658
|
0
|
0
|
|
|
|
|
if ( ++( current) != ( (chars.size() - 1)) ) |
12659
|
|
|
|
|
|
|
goto _resume; |
12660
|
|
|
|
|
|
|
_test_eof: {} |
12661
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
12662
|
|
|
|
|
|
|
{ |
12663
|
0
|
0
|
|
|
|
|
if ( _generic_tokenizer_eof_trans[cs] > 0 ) { |
12664
|
0
|
|
|
|
|
|
_trans = _generic_tokenizer_eof_trans[cs] - 1; |
12665
|
0
|
|
|
|
|
|
goto _eof_trans; |
12666
|
|
|
|
|
|
|
} |
12667
|
|
|
|
|
|
|
} |
12668
|
|
|
|
|
|
|
|
12669
|
|
|
|
|
|
|
_out: {} |
12670
|
|
|
|
|
|
|
} |
12671
|
|
|
|
|
|
|
|
12672
|
|
|
|
|
|
|
(void)act; // Suppress unused variable warning |
12673
|
|
|
|
|
|
|
|
12674
|
0
|
|
|
|
|
|
return !tokens.empty(); |
12675
|
|
|
|
|
|
|
} |
12676
|
|
|
|
|
|
|
|
12677
|
|
|
|
|
|
|
} // namespace morphodita |
12678
|
|
|
|
|
|
|
|
12679
|
|
|
|
|
|
|
///////// |
12680
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory.h |
12681
|
|
|
|
|
|
|
///////// |
12682
|
|
|
|
|
|
|
|
12683
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
12684
|
|
|
|
|
|
|
// |
12685
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
12686
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
12687
|
|
|
|
|
|
|
// |
12688
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
12689
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
12690
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
12691
|
|
|
|
|
|
|
|
12692
|
|
|
|
|
|
|
namespace morphodita { |
12693
|
|
|
|
|
|
|
|
12694
|
0
|
|
|
|
|
|
class generic_tokenizer_factory : public tokenizer_factory { |
12695
|
|
|
|
|
|
|
public: |
12696
|
|
|
|
|
|
|
// Construct a new tokenizer instance. |
12697
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer(const morpho* m) const override; |
12698
|
|
|
|
|
|
|
|
12699
|
|
|
|
|
|
|
bool load(istream& is); |
12700
|
|
|
|
|
|
|
private: |
12701
|
|
|
|
|
|
|
unsigned version; |
12702
|
|
|
|
|
|
|
}; |
12703
|
|
|
|
|
|
|
|
12704
|
|
|
|
|
|
|
} // namespace morphodita |
12705
|
|
|
|
|
|
|
|
12706
|
|
|
|
|
|
|
///////// |
12707
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory.cpp |
12708
|
|
|
|
|
|
|
///////// |
12709
|
|
|
|
|
|
|
|
12710
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
12711
|
|
|
|
|
|
|
// |
12712
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
12713
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
12714
|
|
|
|
|
|
|
// |
12715
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
12716
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
12717
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
12718
|
|
|
|
|
|
|
|
12719
|
|
|
|
|
|
|
namespace morphodita { |
12720
|
|
|
|
|
|
|
|
12721
|
0
|
|
|
|
|
|
tokenizer* generic_tokenizer_factory::new_tokenizer(const morpho* /*m*/) const { |
12722
|
0
|
|
|
|
|
|
return new generic_tokenizer(version); |
12723
|
|
|
|
|
|
|
} |
12724
|
|
|
|
|
|
|
|
12725
|
0
|
|
|
|
|
|
bool generic_tokenizer_factory::load(istream& is) { |
12726
|
0
|
0
|
|
|
|
|
version = is.get(); |
12727
|
|
|
|
|
|
|
|
12728
|
0
|
|
|
|
|
|
return bool(is); |
12729
|
|
|
|
|
|
|
} |
12730
|
|
|
|
|
|
|
|
12731
|
|
|
|
|
|
|
} // namespace morphodita |
12732
|
|
|
|
|
|
|
|
12733
|
|
|
|
|
|
|
///////// |
12734
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory_encoder.h |
12735
|
|
|
|
|
|
|
///////// |
12736
|
|
|
|
|
|
|
|
12737
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
12738
|
|
|
|
|
|
|
// |
12739
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
12740
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
12741
|
|
|
|
|
|
|
// |
12742
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
12743
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
12744
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
12745
|
|
|
|
|
|
|
|
12746
|
|
|
|
|
|
|
namespace morphodita { |
12747
|
|
|
|
|
|
|
|
12748
|
|
|
|
|
|
|
class generic_tokenizer_factory_encoder { |
12749
|
|
|
|
|
|
|
public: |
12750
|
|
|
|
|
|
|
static void encode(unsigned version, ostream& os); |
12751
|
|
|
|
|
|
|
}; |
12752
|
|
|
|
|
|
|
|
12753
|
|
|
|
|
|
|
} // namespace morphodita |
12754
|
|
|
|
|
|
|
|
12755
|
|
|
|
|
|
|
///////// |
12756
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory_encoder.cpp |
12757
|
|
|
|
|
|
|
///////// |
12758
|
|
|
|
|
|
|
|
12759
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
12760
|
|
|
|
|
|
|
// |
12761
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
12762
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
12763
|
|
|
|
|
|
|
// |
12764
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
12765
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
12766
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
12767
|
|
|
|
|
|
|
|
12768
|
|
|
|
|
|
|
namespace morphodita { |
12769
|
|
|
|
|
|
|
|
12770
|
0
|
|
|
|
|
|
void generic_tokenizer_factory_encoder::encode(unsigned version, ostream& os) { |
12771
|
0
|
0
|
|
|
|
|
os.put(version); |
12772
|
0
|
|
|
|
|
|
} |
12773
|
|
|
|
|
|
|
|
12774
|
|
|
|
|
|
|
} // namespace morphodita |
12775
|
|
|
|
|
|
|
|
12776
|
|
|
|
|
|
|
///////// |
12777
|
|
|
|
|
|
|
// File: unilib/uninorms.h |
12778
|
|
|
|
|
|
|
///////// |
12779
|
|
|
|
|
|
|
|
12780
|
|
|
|
|
|
|
// This file is part of UniLib . |
12781
|
|
|
|
|
|
|
// |
12782
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
12783
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
12784
|
|
|
|
|
|
|
// |
12785
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
12786
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
12787
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
12788
|
|
|
|
|
|
|
// |
12789
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
12790
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
12791
|
|
|
|
|
|
|
|
12792
|
|
|
|
|
|
|
namespace unilib { |
12793
|
|
|
|
|
|
|
|
12794
|
|
|
|
|
|
|
class uninorms { |
12795
|
|
|
|
|
|
|
public: |
12796
|
|
|
|
|
|
|
static void nfc(std::u32string& str); |
12797
|
|
|
|
|
|
|
static void nfd(std::u32string& str); |
12798
|
|
|
|
|
|
|
static void nfkc(std::u32string& str); |
12799
|
|
|
|
|
|
|
static void nfkd(std::u32string& str); |
12800
|
|
|
|
|
|
|
|
12801
|
|
|
|
|
|
|
private: |
12802
|
|
|
|
|
|
|
static void compose(std::u32string& str); |
12803
|
|
|
|
|
|
|
static void decompose(std::u32string& str, bool kanonical); |
12804
|
|
|
|
|
|
|
|
12805
|
|
|
|
|
|
|
static const char32_t CHARS = 0x110000; |
12806
|
|
|
|
|
|
|
|
12807
|
|
|
|
|
|
|
struct Hangul { |
12808
|
|
|
|
|
|
|
// Hangul decomposition and composition |
12809
|
|
|
|
|
|
|
static const char32_t SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7; |
12810
|
|
|
|
|
|
|
static const char32_t LCount = 19, VCount = 21, TCount = 28, NCount = VCount * TCount, SCount = LCount * NCount; |
12811
|
|
|
|
|
|
|
}; |
12812
|
|
|
|
|
|
|
|
12813
|
|
|
|
|
|
|
static const uint8_t ccc_index[CHARS >> 8]; |
12814
|
|
|
|
|
|
|
static const uint8_t ccc_block[][256]; |
12815
|
|
|
|
|
|
|
|
12816
|
|
|
|
|
|
|
static const uint8_t composition_index[CHARS >> 8]; |
12817
|
|
|
|
|
|
|
static const uint16_t composition_block[][257]; |
12818
|
|
|
|
|
|
|
static const char32_t composition_data[]; |
12819
|
|
|
|
|
|
|
|
12820
|
|
|
|
|
|
|
static const uint8_t decomposition_index[CHARS >> 8]; |
12821
|
|
|
|
|
|
|
static const uint16_t decomposition_block[][257]; |
12822
|
|
|
|
|
|
|
static const char32_t decomposition_data[]; |
12823
|
|
|
|
|
|
|
}; |
12824
|
|
|
|
|
|
|
|
12825
|
|
|
|
|
|
|
} // namespace unilib |
12826
|
|
|
|
|
|
|
|
12827
|
|
|
|
|
|
|
///////// |
12828
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_network.h |
12829
|
|
|
|
|
|
|
///////// |
12830
|
|
|
|
|
|
|
|
12831
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
12832
|
|
|
|
|
|
|
// |
12833
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
12834
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
12835
|
|
|
|
|
|
|
// |
12836
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
12837
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
12838
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
12839
|
|
|
|
|
|
|
|
12840
|
|
|
|
|
|
|
namespace morphodita { |
12841
|
|
|
|
|
|
|
|
12842
|
|
|
|
|
|
|
// Declarations |
12843
|
|
|
|
|
|
|
|
12844
|
1
|
|
|
|
|
|
class gru_tokenizer_network { |
12845
|
|
|
|
|
|
|
public: |
12846
|
1
|
|
|
|
|
|
virtual ~gru_tokenizer_network() {} |
12847
|
|
|
|
|
|
|
|
12848
|
|
|
|
|
|
|
template struct matrix { |
12849
|
|
|
|
|
|
|
float w[R][C]; |
12850
|
|
|
|
|
|
|
float b[R]; |
12851
|
|
|
|
|
|
|
|
12852
|
|
|
|
|
|
|
void clear(); |
12853
|
|
|
|
|
|
|
void load(binary_decoder& data); |
12854
|
|
|
|
|
|
|
}; |
12855
|
|
|
|
|
|
|
|
12856
|
|
|
|
|
|
|
enum { NO_SPLIT, END_OF_TOKEN, END_OF_SENTENCE, OUTCOMES }; |
12857
|
|
|
|
|
|
|
struct outcome_t { |
12858
|
|
|
|
|
|
|
int outcome; |
12859
|
|
|
|
|
|
|
float w[3]; |
12860
|
|
|
|
|
|
|
const float* embedding; |
12861
|
|
|
|
|
|
|
}; |
12862
|
|
|
|
|
|
|
struct char_info { |
12863
|
|
|
|
|
|
|
char32_t chr; |
12864
|
|
|
|
|
|
|
unilib::unicode::category_t cat; |
12865
|
|
|
|
|
|
|
|
12866
|
|
|
|
|
|
|
char_info() {} |
12867
|
34
|
|
|
|
|
|
char_info(char32_t chr, unilib::unicode::category_t cat) : chr(chr), cat(cat) {} |
12868
|
|
|
|
|
|
|
}; |
12869
|
|
|
|
|
|
|
|
12870
|
|
|
|
|
|
|
virtual void classify(const vector& chars, vector& outcomes) const = 0; |
12871
|
|
|
|
|
|
|
|
12872
|
|
|
|
|
|
|
static gru_tokenizer_network* load(binary_decoder& data); |
12873
|
|
|
|
|
|
|
}; |
12874
|
|
|
|
|
|
|
|
12875
|
|
|
|
|
|
|
template |
12876
|
2
|
|
|
|
|
|
class gru_tokenizer_network_implementation : public gru_tokenizer_network { |
12877
|
|
|
|
|
|
|
public: |
12878
|
|
|
|
|
|
|
virtual void classify(const vector& chars, vector& outcomes) const override; |
12879
|
|
|
|
|
|
|
|
12880
|
|
|
|
|
|
|
static gru_tokenizer_network_implementation* load(binary_decoder& data); |
12881
|
|
|
|
|
|
|
|
12882
|
|
|
|
|
|
|
protected: |
12883
|
|
|
|
|
|
|
void cache_embeddings(); |
12884
|
|
|
|
|
|
|
|
12885
|
|
|
|
|
|
|
struct cached_embedding { |
12886
|
|
|
|
|
|
|
matrix<1, D> e; |
12887
|
|
|
|
|
|
|
matrix<6, D> cache; |
12888
|
|
|
|
|
|
|
}; |
12889
|
|
|
|
|
|
|
|
12890
|
|
|
|
|
|
|
struct gru { |
12891
|
|
|
|
|
|
|
matrix X, X_r, X_z; |
12892
|
|
|
|
|
|
|
matrix H, H_r, H_z; |
12893
|
|
|
|
|
|
|
|
12894
|
|
|
|
|
|
|
void load(binary_decoder& data); |
12895
|
|
|
|
|
|
|
}; |
12896
|
|
|
|
|
|
|
|
12897
|
|
|
|
|
|
|
unordered_map embeddings; |
12898
|
|
|
|
|
|
|
cached_embedding empty_embedding; |
12899
|
|
|
|
|
|
|
gru gru_fwd, gru_bwd; |
12900
|
|
|
|
|
|
|
matrix<3, D> projection_fwd, projection_bwd; |
12901
|
|
|
|
|
|
|
unordered_map unknown_chars; |
12902
|
|
|
|
|
|
|
}; |
12903
|
|
|
|
|
|
|
|
12904
|
|
|
|
|
|
|
// Definitions |
12905
|
|
|
|
|
|
|
|
12906
|
|
|
|
|
|
|
template |
12907
|
|
|
|
|
|
|
void gru_tokenizer_network::matrix::clear() { |
12908
|
4
|
100
|
|
|
|
|
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12909
|
0
|
|
|
|
|
|
fill_n(b, R, 0.f); |
12910
|
|
|
|
|
|
|
} |
12911
|
|
|
|
|
|
|
|
12912
|
|
|
|
|
|
|
template |
12913
|
28
|
|
|
|
|
|
void gru_tokenizer_network::matrix::load(binary_decoder& data) { |
12914
|
212
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
12915
|
14
|
|
|
|
|
|
memcpy(b, data.next(R), sizeof(float) * R); |
12916
|
14
|
|
|
|
|
|
} |
12917
|
|
|
|
|
|
|
|
12918
|
|
|
|
|
|
|
template |
12919
|
2
|
|
|
|
|
|
void gru_tokenizer_network_implementation::gru::load(binary_decoder& data) { |
12920
|
2
|
|
|
|
|
|
X.load(data); |
12921
|
2
|
|
|
|
|
|
X_r.load(data); |
12922
|
2
|
|
|
|
|
|
X_z.load(data); |
12923
|
2
|
|
|
|
|
|
H.load(data); |
12924
|
2
|
|
|
|
|
|
H_r.load(data); |
12925
|
2
|
|
|
|
|
|
H_z.load(data); |
12926
|
2
|
|
|
|
|
|
} |
12927
|
|
|
|
|
|
|
|
12928
|
|
|
|
|
|
|
template |
12929
|
1
|
|
|
|
|
|
void gru_tokenizer_network_implementation::classify(const vector& chars, vector& outcomes) const { |
12930
|
2
|
50
|
|
|
|
|
if (chars.empty()) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12931
|
|
|
|
|
|
|
|
12932
|
|
|
|
|
|
|
// Resolve embeddings, possibly with unknown_chars or empty_embedding |
12933
|
|
|
|
|
|
|
u32string decomposition; |
12934
|
35
|
100
|
|
|
|
|
for (size_t i = 0; i < chars.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12935
|
34
|
|
|
|
|
|
auto embedding = embeddings.find(chars[i].chr); |
12936
|
|
|
|
|
|
|
|
12937
|
|
|
|
|
|
|
// Try finding substitute character if not found, by using NFKD |
12938
|
|
|
|
|
|
|
// and by replacing IDEOGRAPHIC FULL STOP/COMMA. |
12939
|
34
|
|
|
|
|
|
if (embedding == embeddings.end()) { |
12940
|
0
|
|
|
|
|
|
decomposition.assign(1, chars[i].chr); |
12941
|
|
|
|
|
|
|
unilib::uninorms::nfkd(decomposition); |
12942
|
0
|
0
|
|
|
|
|
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12943
|
0
|
0
|
|
|
|
|
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12944
|
0
|
0
|
|
|
|
|
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12945
|
|
|
|
|
|
|
} |
12946
|
|
|
|
|
|
|
|
12947
|
34
|
50
|
|
|
|
|
if (embedding != embeddings.end()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12948
|
34
|
|
|
|
|
|
outcomes[i].embedding = embedding->second.cache.w[0]; |
12949
|
|
|
|
|
|
|
} else { |
12950
|
0
|
|
|
|
|
|
auto unknown_char = unknown_chars.find(chars[i].cat); |
12951
|
0
|
|
|
|
|
|
if (unknown_char != unknown_chars.end()) embedding = embeddings.find(unknown_char->second); |
12952
|
0
|
0
|
|
|
|
|
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12953
|
|
|
|
|
|
|
} |
12954
|
|
|
|
|
|
|
} |
12955
|
|
|
|
|
|
|
|
12956
|
|
|
|
|
|
|
// Clear outcome probabilities |
12957
|
35
|
100
|
|
|
|
|
for (auto&& outcome : outcomes) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12958
|
136
|
100
|
|
|
|
|
for (int i = 0; i < 3; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12959
|
102
|
|
|
|
|
|
outcome.w[i] = projection_fwd.b[i]; |
12960
|
|
|
|
|
|
|
|
12961
|
|
|
|
|
|
|
// Perform forward & backward GRU |
12962
|
|
|
|
|
|
|
matrix<1, D> state, update, reset, candidate; |
12963
|
3
|
100
|
|
|
|
|
for (int dir = 0; dir < 2; dir++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12964
|
2
|
100
|
|
|
|
|
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12965
|
2
|
100
|
|
|
|
|
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12966
|
|
|
|
|
|
|
|
12967
|
|
|
|
|
|
|
state.clear(); |
12968
|
70
|
100
|
|
|
|
|
for (size_t i = 0; i < outcomes.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12969
|
68
|
100
|
|
|
|
|
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12970
|
68
|
100
|
|
|
|
|
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12971
|
|
|
|
|
|
|
|
12972
|
1156
|
100
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12973
|
1088
|
|
|
|
|
|
update.w[0][j] = gru.X_z.b[j] + embedding_cache[2*D + j]; |
12974
|
1088
|
|
|
|
|
|
reset.w[0][j] = gru.X_r.b[j] + embedding_cache[D + j]; |
12975
|
18496
|
100
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12976
|
17408
|
|
|
|
|
|
update.w[0][j] += state.w[0][k] * gru.H_z.w[j][k]; |
12977
|
17408
|
|
|
|
|
|
reset.w[0][j] += state.w[0][k] * gru.H_r.w[j][k]; |
12978
|
|
|
|
|
|
|
} |
12979
|
2176
|
|
|
|
|
|
update.w[0][j] = 1.f / (1.f + exp(-update.w[0][j])); |
12980
|
2176
|
|
|
|
|
|
reset.w[0][j] = 1.f / (1.f + exp(-reset.w[0][j])); |
12981
|
1088
|
|
|
|
|
|
reset.w[0][j] *= state.w[0][j]; |
12982
|
|
|
|
|
|
|
} |
12983
|
1156
|
100
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12984
|
1088
|
|
|
|
|
|
candidate.w[0][j] = gru.X.b[j] + embedding_cache[j]; |
12985
|
18496
|
100
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12986
|
17408
|
|
|
|
|
|
candidate.w[0][j] += reset.w[0][k] * gru.H.w[j][k]; |
12987
|
1088
|
|
|
|
|
|
candidate.w[0][j] = tanh(candidate.w[0][j]); |
12988
|
1088
|
|
|
|
|
|
state.w[0][j] = update.w[0][j] * state.w[0][j] + (1.f - update.w[0][j]) * candidate.w[0][j]; |
12989
|
|
|
|
|
|
|
} |
12990
|
|
|
|
|
|
|
|
12991
|
272
|
100
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12992
|
3468
|
100
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12993
|
3264
|
|
|
|
|
|
outcome.w[j] += projection.w[j][k] * state.w[0][k]; |
12994
|
|
|
|
|
|
|
} |
12995
|
|
|
|
|
|
|
} |
12996
|
|
|
|
|
|
|
|
12997
|
|
|
|
|
|
|
// Choose the outcome with the highest weight |
12998
|
35
|
100
|
|
|
|
|
for (auto&& outcome : outcomes) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
12999
|
34
|
|
|
|
|
|
outcome.outcome = outcome.w[1] > outcome.w[0]; |
13000
|
34
|
100
|
|
|
|
|
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13001
|
|
|
|
|
|
|
} |
13002
|
|
|
|
|
|
|
} |
13003
|
|
|
|
|
|
|
|
13004
|
|
|
|
|
|
|
template |
13005
|
1
|
|
|
|
|
|
gru_tokenizer_network_implementation* gru_tokenizer_network_implementation::load(binary_decoder& data) { |
13006
|
1
|
|
|
|
|
|
unique_ptr> network(new gru_tokenizer_network_implementation()); |
13007
|
|
|
|
|
|
|
|
13008
|
21
|
0
|
|
|
|
|
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13009
|
20
|
0
|
|
|
|
|
auto& embedding = network->embeddings[data.next_4B()]; |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13010
|
20
|
0
|
|
|
|
|
copy_n(data.next(D), D, embedding.e.w[0]); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13011
|
|
|
|
|
|
|
} |
13012
|
1
|
|
|
|
|
|
fill_n(network->empty_embedding.e.w[0], D, 0.f); |
13013
|
|
|
|
|
|
|
|
13014
|
1
|
0
|
|
|
|
|
network->gru_fwd.load(data); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13015
|
1
|
0
|
|
|
|
|
network->gru_bwd.load(data); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13016
|
1
|
0
|
|
|
|
|
network->projection_fwd.load(data); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13017
|
1
|
0
|
|
|
|
|
network->projection_bwd.load(data); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13018
|
|
|
|
|
|
|
|
13019
|
|
|
|
|
|
|
network->unknown_chars.clear(); |
13020
|
5
|
0
|
|
|
|
|
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13021
|
4
|
0
|
|
|
|
|
unilib::unicode::category_t cat = data.next_4B(); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13022
|
4
|
0
|
|
|
|
|
network->unknown_chars[cat] = data.next_4B(); |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13023
|
|
|
|
|
|
|
} |
13024
|
|
|
|
|
|
|
|
13025
|
1
|
|
|
|
|
|
network->cache_embeddings(); |
13026
|
|
|
|
|
|
|
|
13027
|
1
|
|
|
|
|
|
return network.release(); |
13028
|
|
|
|
|
|
|
} |
13029
|
|
|
|
|
|
|
|
13030
|
|
|
|
|
|
|
template |
13031
|
2
|
|
|
|
|
|
void gru_tokenizer_network_implementation::cache_embeddings() { |
13032
|
21
|
0
|
|
|
|
|
for (auto&& embedding : embeddings) { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13033
|
|
|
|
|
|
|
auto& e = embedding.second.e; |
13034
|
|
|
|
|
|
|
auto& cache = embedding.second.cache; |
13035
|
|
|
|
|
|
|
|
13036
|
140
|
0
|
|
|
|
|
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13037
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13038
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13039
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13040
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13041
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13042
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13043
|
|
|
|
|
|
|
} |
13044
|
7
|
0
|
|
|
|
|
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13045
|
1
|
|
|
|
|
|
} |
13046
|
|
|
|
|
|
|
|
13047
|
|
|
|
|
|
|
} // namespace morphodita |
13048
|
|
|
|
|
|
|
|
13049
|
|
|
|
|
|
|
///////// |
13050
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer.h |
13051
|
|
|
|
|
|
|
///////// |
13052
|
|
|
|
|
|
|
|
13053
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13054
|
|
|
|
|
|
|
// |
13055
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13056
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13057
|
|
|
|
|
|
|
// |
13058
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13059
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13060
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13061
|
|
|
|
|
|
|
|
13062
|
|
|
|
|
|
|
namespace morphodita { |
13063
|
|
|
|
|
|
|
|
13064
|
4
|
|
|
|
|
|
class gru_tokenizer : public unicode_tokenizer { |
13065
|
|
|
|
|
|
|
public: |
13066
|
|
|
|
|
|
|
gru_tokenizer(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const gru_tokenizer_network& network) |
13067
|
1
|
0
|
|
|
|
|
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13068
|
|
|
|
|
|
|
|
13069
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
13070
|
|
|
|
|
|
|
|
13071
|
|
|
|
|
|
|
private: |
13072
|
|
|
|
|
|
|
inline bool is_space(size_t index); |
13073
|
|
|
|
|
|
|
int next_outcome(); |
13074
|
|
|
|
|
|
|
|
13075
|
|
|
|
|
|
|
unsigned segment; |
13076
|
|
|
|
|
|
|
bool allow_spaces; |
13077
|
|
|
|
|
|
|
unsigned network_index, network_length; |
13078
|
|
|
|
|
|
|
vector network_chars; |
13079
|
|
|
|
|
|
|
vector network_outcomes; |
13080
|
|
|
|
|
|
|
vector network_offsets; |
13081
|
|
|
|
|
|
|
const gru_tokenizer_network& network; |
13082
|
|
|
|
|
|
|
}; |
13083
|
|
|
|
|
|
|
|
13084
|
|
|
|
|
|
|
} // namespace morphodita |
13085
|
|
|
|
|
|
|
|
13086
|
|
|
|
|
|
|
///////// |
13087
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer.cpp |
13088
|
|
|
|
|
|
|
///////// |
13089
|
|
|
|
|
|
|
|
13090
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13091
|
|
|
|
|
|
|
// |
13092
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13093
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13094
|
|
|
|
|
|
|
// |
13095
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13096
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13097
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13098
|
|
|
|
|
|
|
|
13099
|
|
|
|
|
|
|
namespace morphodita { |
13100
|
|
|
|
|
|
|
|
13101
|
|
|
|
|
|
|
bool gru_tokenizer::is_space(size_t index) { |
13102
|
83
|
100
|
|
|
|
|
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13103
|
|
|
|
|
|
|
} |
13104
|
|
|
|
|
|
|
|
13105
|
2
|
|
|
|
|
|
bool gru_tokenizer::next_sentence(vector& tokens) { |
13106
|
|
|
|
|
|
|
tokens.clear(); |
13107
|
|
|
|
|
|
|
|
13108
|
|
|
|
|
|
|
// Reset tokenizer on new text |
13109
|
9
|
100
|
|
|
|
|
if (current == 0) network_index = network_length = 0; |
13110
|
|
|
|
|
|
|
|
13111
|
|
|
|
|
|
|
// Tokenize until EOS |
13112
|
9
|
100
|
|
|
|
|
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13113
|
25
|
100
|
|
|
|
|
while (current < chars.size() - 1 && is_space(current)) |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
13114
|
5
|
50
|
|
|
|
|
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13115
|
|
|
|
|
|
|
break; |
13116
|
|
|
|
|
|
|
|
13117
|
8
|
100
|
|
|
|
|
if (current >= chars.size() - 1) break; |
13118
|
|
|
|
|
|
|
|
13119
|
|
|
|
|
|
|
// We have a beginning of a token. Try if it is an URL. |
13120
|
7
|
50
|
|
|
|
|
if (tokenize_url_email(tokens)) { |
13121
|
0
|
0
|
|
|
|
|
while (network_index < network_length && network_offsets[network_index] < current) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13122
|
0
|
0
|
|
|
|
|
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13123
|
|
|
|
|
|
|
eos = true; |
13124
|
0
|
|
|
|
|
|
continue; |
13125
|
|
|
|
|
|
|
} |
13126
|
|
|
|
|
|
|
|
13127
|
|
|
|
|
|
|
// Slurp current token |
13128
|
7
|
|
|
|
|
|
size_t token_start = current; |
13129
|
22
|
50
|
|
|
|
|
do { |
13130
|
29
|
|
|
|
|
|
int outcome = next_outcome(); |
13131
|
29
|
|
|
|
|
|
eos = outcome == gru_tokenizer_network::END_OF_SENTENCE; |
13132
|
29
|
100
|
|
|
|
|
if (outcome != gru_tokenizer_network::NO_SPLIT) break; |
13133
|
44
|
|
|
|
|
|
} while (current < chars.size() - 1); |
13134
|
8
|
|
|
|
|
|
tokens.emplace_back(token_start, current - token_start); |
13135
|
|
|
|
|
|
|
} |
13136
|
|
|
|
|
|
|
|
13137
|
2
|
|
|
|
|
|
return !tokens.empty(); |
13138
|
|
|
|
|
|
|
} |
13139
|
|
|
|
|
|
|
|
13140
|
34
|
|
|
|
|
|
int gru_tokenizer::next_outcome() { |
13141
|
34
|
100
|
|
|
|
|
if (network_index >= network_length) { |
13142
|
|
|
|
|
|
|
// Compute required window |
13143
|
1
|
|
|
|
|
|
network_index = 0; |
13144
|
1
|
|
|
|
|
|
network_length = 0; |
13145
|
|
|
|
|
|
|
network_chars.clear(); |
13146
|
|
|
|
|
|
|
network_outcomes.clear(); |
13147
|
|
|
|
|
|
|
network_offsets.clear(); |
13148
|
|
|
|
|
|
|
|
13149
|
|
|
|
|
|
|
// Prepare data for the classification |
13150
|
70
|
100
|
|
|
|
|
for (size_t offset = current; |
13151
|
35
|
100
|
|
|
|
|
network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment; |
|
|
50
|
|
|
|
|
|
13152
|
34
|
|
|
|
|
|
network_length++, offset++) { |
13153
|
34
|
100
|
|
|
|
|
if (is_space(offset)) { |
13154
|
5
|
|
|
|
|
|
network_chars.emplace_back(' ', unilib::unicode::Zs); |
13155
|
9
|
100
|
|
|
|
|
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13156
|
|
|
|
|
|
|
} else { |
13157
|
29
|
|
|
|
|
|
network_chars.emplace_back(chars[offset].chr, chars[offset].cat); |
13158
|
|
|
|
|
|
|
} |
13159
|
|
|
|
|
|
|
} |
13160
|
|
|
|
|
|
|
// Add a space to the end on the EOD |
13161
|
1
|
50
|
|
|
|
|
if (network_length < segment && network_chars.back().chr != ' ') |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13162
|
0
|
|
|
|
|
|
network_chars.emplace_back(' ', unilib::unicode::Zs); |
13163
|
1
|
|
|
|
|
|
network_outcomes.resize(network_chars.size()); |
13164
|
|
|
|
|
|
|
|
13165
|
|
|
|
|
|
|
// Perform the classification |
13166
|
34
|
|
|
|
|
|
network.classify(network_chars, network_outcomes); |
13167
|
|
|
|
|
|
|
|
13168
|
|
|
|
|
|
|
// Add spacing token/sentence breaks |
13169
|
34
|
100
|
|
|
|
|
for (size_t i = 0; i < network_length - 1; i++) |
13170
|
33
|
100
|
|
|
|
|
if (is_space(network_offsets[i+1])) { |
13171
|
|
|
|
|
|
|
// Detect EOS on the following space or \n\n or \r\n\r\n, or if there is end of text |
13172
|
5
|
|
|
|
|
|
bool eos = network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_SENTENCE; |
13173
|
5
|
100
|
|
|
|
|
if (i + 2 == network_length) eos = true; |
13174
|
5
|
50
|
|
|
|
|
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13175
|
0
|
0
|
|
|
|
|
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13176
|
0
|
0
|
|
|
|
|
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13177
|
5
|
100
|
|
|
|
|
if (eos) network_outcomes[i].outcome = gru_tokenizer_network::END_OF_SENTENCE; |
13178
|
|
|
|
|
|
|
|
13179
|
5
|
100
|
|
|
|
|
if (network_outcomes[i].outcome == gru_tokenizer_network::NO_SPLIT) |
13180
|
|
|
|
|
|
|
// Force EOT if not allowing spaces, and also detect EOT on the following space |
13181
|
4
|
50
|
|
|
|
|
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
13182
|
4
|
|
|
|
|
|
network_outcomes[i].outcome = gru_tokenizer_network::END_OF_TOKEN; |
13183
|
|
|
|
|
|
|
} |
13184
|
|
|
|
|
|
|
|
13185
|
|
|
|
|
|
|
// Adjust network_length to suitable break |
13186
|
1
|
50
|
|
|
|
|
if (network_length == segment && network_length >= 10) { |
|
|
0
|
|
|
|
|
|
13187
|
0
|
|
|
|
|
|
network_length -= 5; |
13188
|
0
|
0
|
|
|
|
|
while (network_length > segment / 2) |
13189
|
0
|
0
|
|
|
|
|
if (network_outcomes[--network_length].outcome != gru_tokenizer_network::NO_SPLIT) |
13190
|
|
|
|
|
|
|
break; |
13191
|
|
|
|
|
|
|
} |
13192
|
|
|
|
|
|
|
} |
13193
|
102
|
|
|
|
|
|
return current = network_offsets[network_index + 1], network_outcomes[network_index++].outcome; |
13194
|
|
|
|
|
|
|
} |
13195
|
|
|
|
|
|
|
|
13196
|
|
|
|
|
|
|
} // namespace morphodita |
13197
|
|
|
|
|
|
|
|
13198
|
|
|
|
|
|
|
///////// |
13199
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_factory.h |
13200
|
|
|
|
|
|
|
///////// |
13201
|
|
|
|
|
|
|
|
13202
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13203
|
|
|
|
|
|
|
// |
13204
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13205
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13206
|
|
|
|
|
|
|
// |
13207
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13208
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13209
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13210
|
|
|
|
|
|
|
|
13211
|
|
|
|
|
|
|
namespace morphodita { |
13212
|
|
|
|
|
|
|
|
13213
|
2
|
|
|
|
|
|
class gru_tokenizer_factory : public tokenizer_factory { |
13214
|
|
|
|
|
|
|
public: |
13215
|
|
|
|
|
|
|
// Construct a new tokenizer instance. |
13216
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer(const morpho* m) const override; |
13217
|
|
|
|
|
|
|
|
13218
|
|
|
|
|
|
|
bool load(istream& is); |
13219
|
|
|
|
|
|
|
|
13220
|
|
|
|
|
|
|
private: |
13221
|
|
|
|
|
|
|
unsigned url_email_tokenizer; |
13222
|
|
|
|
|
|
|
unsigned segment; |
13223
|
|
|
|
|
|
|
bool allow_spaces; |
13224
|
|
|
|
|
|
|
|
13225
|
|
|
|
|
|
|
unique_ptr network; |
13226
|
|
|
|
|
|
|
}; |
13227
|
|
|
|
|
|
|
|
13228
|
|
|
|
|
|
|
} // namespace morphodita |
13229
|
|
|
|
|
|
|
|
13230
|
|
|
|
|
|
|
///////// |
13231
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_factory.cpp |
13232
|
|
|
|
|
|
|
///////// |
13233
|
|
|
|
|
|
|
|
13234
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13235
|
|
|
|
|
|
|
// |
13236
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13237
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13238
|
|
|
|
|
|
|
// |
13239
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13240
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13241
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13242
|
|
|
|
|
|
|
|
13243
|
|
|
|
|
|
|
namespace morphodita { |
13244
|
|
|
|
|
|
|
|
13245
|
1
|
|
|
|
|
|
tokenizer* gru_tokenizer_factory::new_tokenizer(const morpho* /*m*/) const { |
13246
|
2
|
|
|
|
|
|
return new gru_tokenizer(url_email_tokenizer, segment, allow_spaces, *network); |
13247
|
|
|
|
|
|
|
} |
13248
|
|
|
|
|
|
|
|
13249
|
1
|
|
|
|
|
|
bool gru_tokenizer_factory::load(istream& is) { |
13250
|
|
|
|
|
|
|
char version; |
13251
|
1
|
50
|
|
|
|
|
if (!is.get(version)) return false; |
13252
|
1
|
50
|
|
|
|
|
if (!(version >= 1 && version <= 2)) return false; |
13253
|
|
|
|
|
|
|
|
13254
|
|
|
|
|
|
|
binary_decoder data; |
13255
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
50
|
|
|
|
|
|
13256
|
|
|
|
|
|
|
|
13257
|
|
|
|
|
|
|
try { |
13258
|
1
|
50
|
|
|
|
|
url_email_tokenizer = data.next_1B(); |
13259
|
1
|
50
|
|
|
|
|
segment = data.next_2B(); |
13260
|
1
|
50
|
|
|
|
|
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13261
|
|
|
|
|
|
|
|
13262
|
1
|
50
|
|
|
|
|
network.reset(gru_tokenizer_network::load(data)); |
13263
|
1
|
50
|
|
|
|
|
if (!network) return false; |
|
|
0
|
|
|
|
|
|
13264
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
13265
|
|
|
|
|
|
|
return false; |
13266
|
|
|
|
|
|
|
} |
13267
|
|
|
|
|
|
|
|
13268
|
1
|
|
|
|
|
|
return data.is_end(); |
13269
|
|
|
|
|
|
|
} |
13270
|
|
|
|
|
|
|
|
13271
|
|
|
|
|
|
|
} // namespace morphodita |
13272
|
|
|
|
|
|
|
|
13273
|
|
|
|
|
|
|
///////// |
13274
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_network.cpp |
13275
|
|
|
|
|
|
|
///////// |
13276
|
|
|
|
|
|
|
|
13277
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13278
|
|
|
|
|
|
|
// |
13279
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13280
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13281
|
|
|
|
|
|
|
// |
13282
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13283
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13284
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13285
|
|
|
|
|
|
|
|
13286
|
|
|
|
|
|
|
namespace morphodita { |
13287
|
|
|
|
|
|
|
|
13288
|
1
|
|
|
|
|
|
gru_tokenizer_network* gru_tokenizer_network::load(binary_decoder& data) { |
13289
|
1
|
50
|
|
|
|
|
if (data.next_1B() != 1) return nullptr; |
13290
|
1
|
|
|
|
|
|
switch (data.next_1B()) { |
13291
|
1
|
|
|
|
|
|
case 16: return gru_tokenizer_network_implementation<16>::load(data); |
13292
|
0
|
|
|
|
|
|
case 24: return gru_tokenizer_network_implementation<24>::load(data); |
13293
|
0
|
|
|
|
|
|
case 64: return gru_tokenizer_network_implementation<64>::load(data); |
13294
|
|
|
|
|
|
|
} |
13295
|
|
|
|
|
|
|
return nullptr; |
13296
|
|
|
|
|
|
|
} |
13297
|
|
|
|
|
|
|
|
13298
|
|
|
|
|
|
|
} // namespace morphodita |
13299
|
|
|
|
|
|
|
|
13300
|
|
|
|
|
|
|
///////// |
13301
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_trainer.h |
13302
|
|
|
|
|
|
|
///////// |
13303
|
|
|
|
|
|
|
|
13304
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13305
|
|
|
|
|
|
|
// |
13306
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13307
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13308
|
|
|
|
|
|
|
// |
13309
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13310
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13311
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13312
|
|
|
|
|
|
|
|
13313
|
|
|
|
|
|
|
namespace morphodita { |
13314
|
|
|
|
|
|
|
|
13315
|
0
|
|
|
|
|
|
struct tokenized_sentence { |
13316
|
|
|
|
|
|
|
u32string sentence; |
13317
|
|
|
|
|
|
|
vector tokens; |
13318
|
|
|
|
|
|
|
}; |
13319
|
|
|
|
|
|
|
|
13320
|
|
|
|
|
|
|
class gru_tokenizer_trainer { |
13321
|
|
|
|
|
|
|
public: |
13322
|
|
|
|
|
|
|
enum { URL_EMAIL_LATEST = unicode_tokenizer::URL_EMAIL_LATEST }; |
13323
|
|
|
|
|
|
|
|
13324
|
|
|
|
|
|
|
static bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs, |
13325
|
|
|
|
|
|
|
unsigned batch_size, float learning_rate, float learning_rate_final, float dropout, |
13326
|
|
|
|
|
|
|
float initialization_range, bool early_stopping, const vector& data, |
13327
|
|
|
|
|
|
|
const vector& heldout, ostream& os, string& error); |
13328
|
|
|
|
|
|
|
}; |
13329
|
|
|
|
|
|
|
|
13330
|
|
|
|
|
|
|
} // namespace morphodita |
13331
|
|
|
|
|
|
|
|
13332
|
|
|
|
|
|
|
///////// |
13333
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_network_trainer.h |
13334
|
|
|
|
|
|
|
///////// |
13335
|
|
|
|
|
|
|
|
13336
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13337
|
|
|
|
|
|
|
// |
13338
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13339
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13340
|
|
|
|
|
|
|
// |
13341
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13342
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13343
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13344
|
|
|
|
|
|
|
|
13345
|
|
|
|
|
|
|
namespace morphodita { |
13346
|
|
|
|
|
|
|
|
13347
|
|
|
|
|
|
|
// |
13348
|
|
|
|
|
|
|
// Declarations |
13349
|
|
|
|
|
|
|
// |
13350
|
|
|
|
|
|
|
|
13351
|
|
|
|
|
|
|
template |
13352
|
0
|
0
|
|
|
|
|
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13353
|
|
|
|
|
|
|
public: |
13354
|
|
|
|
|
|
|
bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size, |
13355
|
|
|
|
|
|
|
float learning_rate, float learning_rate_final, float dropout, float initialization_range, |
13356
|
|
|
|
|
|
|
bool early_stopping, const vector& data, const vector& heldout, |
13357
|
|
|
|
|
|
|
binary_encoder& enc, string& error); |
13358
|
|
|
|
|
|
|
|
13359
|
|
|
|
|
|
|
private: |
13360
|
|
|
|
|
|
|
template using matrix = typename gru_tokenizer_network_implementation::template matrix; |
13361
|
|
|
|
|
|
|
using typename gru_tokenizer_network_implementation::cached_embedding; |
13362
|
|
|
|
|
|
|
using typename gru_tokenizer_network_implementation::gru; |
13363
|
|
|
|
|
|
|
|
13364
|
|
|
|
|
|
|
template struct matrix_trainer { |
13365
|
|
|
|
|
|
|
matrix& original; |
13366
|
|
|
|
|
|
|
float w_g[R][C], b_g[R]; |
13367
|
|
|
|
|
|
|
float w_m[R][C], b_m[R]; |
13368
|
|
|
|
|
|
|
float w_v[R][C], b_v[R]; |
13369
|
|
|
|
|
|
|
|
13370
|
0
|
0
|
|
|
|
|
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13371
|
|
|
|
|
|
|
void update_weights(float learning_rate); |
13372
|
|
|
|
|
|
|
}; |
13373
|
0
|
|
|
|
|
|
struct gru_trainer { |
13374
|
|
|
|
|
|
|
matrix_trainer X, X_r, X_z; |
13375
|
|
|
|
|
|
|
matrix_trainer H, H_r, H_z; |
13376
|
|
|
|
|
|
|
vector> states, updates, resets, resetstates, candidates, dropouts; |
13377
|
|
|
|
|
|
|
|
13378
|
0
|
|
|
|
|
|
gru_trainer(gru& g, unsigned segment) |
13379
|
|
|
|
|
|
|
: X(g.X), X_r(g.X_r), X_z(g.X_z), H(g.H), H_r(g.H_r), H_z(g.H_z), states(segment + 1), |
13380
|
0
|
0
|
|
|
|
|
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13381
|
|
|
|
|
|
|
void update_weights(float learning_rate); |
13382
|
|
|
|
|
|
|
}; |
13383
|
|
|
|
|
|
|
|
13384
|
|
|
|
|
|
|
struct f1_info { double precision, recall, f1; }; |
13385
|
|
|
|
|
|
|
void evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector& heldout, |
13386
|
|
|
|
|
|
|
f1_info& tokens_f1, f1_info& sentences_f1); |
13387
|
|
|
|
|
|
|
void evaluate_f1(const vector& system, const vector& gold, f1_info& f1); |
13388
|
|
|
|
|
|
|
|
13389
|
|
|
|
|
|
|
template void random_matrix(matrix& m, mt19937& generator, float range, float bias); |
13390
|
|
|
|
|
|
|
void random_gru(gru& g, mt19937& generator, float range); |
13391
|
|
|
|
|
|
|
|
13392
|
|
|
|
|
|
|
template void save_matrix(const matrix& m, binary_encoder& enc); |
13393
|
|
|
|
|
|
|
void save_gru(const gru& g, binary_encoder& enc); |
13394
|
|
|
|
|
|
|
}; |
13395
|
|
|
|
|
|
|
|
13396
|
|
|
|
|
|
|
// |
13397
|
|
|
|
|
|
|
// Definitions |
13398
|
|
|
|
|
|
|
// |
13399
|
|
|
|
|
|
|
|
13400
|
|
|
|
|
|
|
template |
13401
|
0
|
|
|
|
|
|
bool gru_tokenizer_network_trainer::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size, |
13402
|
|
|
|
|
|
|
float learning_rate_initial, float learning_rate_final, float dropout, |
13403
|
|
|
|
|
|
|
float initialization_range, bool early_stopping, const vector& data, |
13404
|
|
|
|
|
|
|
const vector& heldout, binary_encoder& enc, string& error) { |
13405
|
0
|
0
|
|
|
|
|
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13406
|
|
|
|
|
|
|
|
13407
|
|
|
|
|
|
|
unsigned characters = 0; |
13408
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13409
|
0
|
|
|
|
|
|
characters += sentence.sentence.size(); |
13410
|
0
|
0
|
|
|
|
|
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13411
|
|
|
|
|
|
|
|
13412
|
|
|
|
|
|
|
mt19937 generator; |
13413
|
|
|
|
|
|
|
|
13414
|
0
|
|
|
|
|
|
float dropout_multiplier = 1.f / (1.f - dropout); |
13415
|
0
|
|
|
|
|
|
bernoulli_distribution dropout_distribution(dropout); |
13416
|
|
|
|
|
|
|
|
13417
|
|
|
|
|
|
|
// Generate embeddings |
13418
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13419
|
0
|
0
|
|
|
|
|
for (auto&& chr : sentence.sentence) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13420
|
0
|
|
|
|
|
|
if (!this->embeddings.count(chr)) { |
13421
|
|
|
|
|
|
|
cached_embedding embedding; |
13422
|
0
|
|
|
|
|
|
random_matrix(embedding.e, generator, initialization_range, 0.f); |
13423
|
|
|
|
|
|
|
this->embeddings.emplace(chr, embedding); |
13424
|
|
|
|
|
|
|
} |
13425
|
|
|
|
|
|
|
this->empty_embedding.e.clear(); |
13426
|
|
|
|
|
|
|
|
13427
|
|
|
|
|
|
|
// Initialize weights |
13428
|
0
|
|
|
|
|
|
random_gru(this->gru_fwd, generator, initialization_range); |
13429
|
0
|
|
|
|
|
|
random_gru(this->gru_bwd, generator, initialization_range); |
13430
|
0
|
|
|
|
|
|
random_matrix(this->projection_fwd, generator, initialization_range, 0.f); this->projection_fwd.b[this->NO_SPLIT] = 1.f; |
13431
|
0
|
|
|
|
|
|
random_matrix(this->projection_bwd, generator, initialization_range, 0.f); this->projection_bwd.b[this->NO_SPLIT] = 1.f; |
13432
|
|
|
|
|
|
|
|
13433
|
|
|
|
|
|
|
// Train the network |
13434
|
|
|
|
|
|
|
unordered_map> embeddings; |
13435
|
0
|
0
|
|
|
|
|
for (auto&& embedding : this->embeddings) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13436
|
0
|
|
|
|
|
|
embeddings.emplace(embedding.first, embedding.second.e); |
13437
|
0
|
0
|
|
|
|
|
vector*> chosen_embeddings(segment); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13438
|
0
|
0
|
|
|
|
|
vector> embedding_dropouts(segment); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13439
|
0
|
0
|
|
|
|
|
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13440
|
0
|
|
|
|
|
|
matrix_trainer<3, D> projection_fwd(this->projection_fwd), projection_bwd(this->projection_bwd); |
13441
|
|
|
|
|
|
|
float learning_rate = learning_rate_initial, b1t = 1.f, b2t = 1.f; |
13442
|
|
|
|
|
|
|
|
13443
|
|
|
|
|
|
|
float best_combined_f1 = 0.f; unsigned best_combined_f1_epoch = 0; |
13444
|
|
|
|
|
|
|
gru_tokenizer_network_trainer best_combined_f1_network; |
13445
|
|
|
|
|
|
|
|
13446
|
|
|
|
|
|
|
size_t training_offset = 0, training_shift; |
13447
|
0
|
0
|
|
|
|
|
vector training_input, instance_input(segment); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13448
|
0
|
0
|
|
|
|
|
vector training_output, instance_output(segment); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13449
|
0
|
0
|
|
|
|
|
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13450
|
0
|
0
|
|
|
|
|
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13451
|
|
|
|
|
|
|
double logprob = 0; |
13452
|
|
|
|
|
|
|
int total = 0, correct = 0; |
13453
|
|
|
|
|
|
|
|
13454
|
0
|
0
|
|
|
|
|
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13455
|
|
|
|
|
|
|
// Prepare input instance |
13456
|
0
|
0
|
|
|
|
|
if (training_offset + segment >= training_input.size()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13457
|
0
|
|
|
|
|
|
shuffle(permutation.begin(), permutation.end(), generator); |
13458
|
|
|
|
|
|
|
training_input.clear(); training_output.clear(); |
13459
|
0
|
0
|
|
|
|
|
for (auto&& index : permutation) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13460
|
0
|
|
|
|
|
|
auto& sentence = data[index]; |
13461
|
0
|
0
|
|
|
|
|
if (sentence.tokens.empty()) continue; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13462
|
|
|
|
|
|
|
|
13463
|
|
|
|
|
|
|
training_offset = training_input.size(); |
13464
|
0
|
0
|
|
|
|
|
training_input.resize(training_offset + sentence.sentence.size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13465
|
0
|
0
|
|
|
|
|
training_output.resize(training_offset + sentence.sentence.size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13466
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13467
|
0
|
|
|
|
|
|
training_input[training_offset + i].chr = sentence.sentence[i]; |
13468
|
0
|
|
|
|
|
|
training_output[training_offset + i].outcome = gru_tokenizer_network::NO_SPLIT; |
13469
|
|
|
|
|
|
|
} |
13470
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < sentence.tokens.size(); i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13471
|
0
|
0
|
|
|
|
|
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13472
|
|
|
|
|
|
|
i+1 < sentence.tokens.size() ? gru_tokenizer_network::END_OF_TOKEN : gru_tokenizer_network::END_OF_SENTENCE; |
13473
|
|
|
|
|
|
|
} |
13474
|
|
|
|
|
|
|
training_offset = 0; |
13475
|
|
|
|
|
|
|
} |
13476
|
|
|
|
|
|
|
copy_n(training_input.begin() + training_offset, segment, instance_input.begin()); |
13477
|
|
|
|
|
|
|
copy_n(training_output.begin() + training_offset, segment, instance_output.begin()); |
13478
|
|
|
|
|
|
|
|
13479
|
|
|
|
|
|
|
// Shift training_offset |
13480
|
0
|
0
|
|
|
|
|
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13481
|
0
|
0
|
|
|
|
|
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13482
|
|
|
|
|
|
|
break; |
13483
|
0
|
|
|
|
|
|
training_offset += training_shift; |
13484
|
|
|
|
|
|
|
|
13485
|
|
|
|
|
|
|
// Forward pass |
13486
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < segment; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13487
|
0
|
|
|
|
|
|
chosen_embeddings[i] = &embeddings.at(instance_input[i].chr); |
13488
|
0
|
0
|
|
|
|
|
for (unsigned k = 0; k < D; k++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13489
|
0
|
0
|
|
|
|
|
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13490
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13491
|
0
|
|
|
|
|
|
instance_output[i].w[j] = projection_fwd.original.b[j]; |
13492
|
|
|
|
|
|
|
} |
13493
|
|
|
|
|
|
|
|
13494
|
0
|
0
|
|
|
|
|
for (int dir = 0; dir < 2; dir++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13495
|
0
|
0
|
|
|
|
|
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13496
|
0
|
0
|
|
|
|
|
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13497
|
|
|
|
|
|
|
|
13498
|
|
|
|
|
|
|
gru.states[0].clear(); |
13499
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < segment; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13500
|
0
|
0
|
|
|
|
|
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13501
|
0
|
0
|
|
|
|
|
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13502
|
0
|
0
|
|
|
|
|
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13503
|
|
|
|
|
|
|
|
13504
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13505
|
0
|
|
|
|
|
|
gru.updates[i].w[0][j] = gru.X_z.original.b[j]; |
13506
|
0
|
|
|
|
|
|
gru.resets[i].w[0][j] = gru.X_r.original.b[j]; |
13507
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13508
|
0
|
|
|
|
|
|
gru.updates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_z.original.w[j][k] + gru.states[i].w[0][k] * gru.H_z.original.w[j][k]; |
13509
|
0
|
|
|
|
|
|
gru.resets[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_r.original.w[j][k] + gru.states[i].w[0][k] * gru.H_r.original.w[j][k]; |
13510
|
|
|
|
|
|
|
} |
13511
|
0
|
|
|
|
|
|
gru.updates[i].w[0][j] = 1.f / (1.f + exp(-gru.updates[i].w[0][j])); |
13512
|
0
|
|
|
|
|
|
gru.resets[i].w[0][j] = 1.f / (1.f + exp(-gru.resets[i].w[0][j])); |
13513
|
0
|
|
|
|
|
|
gru.resetstates[i].w[0][j] = gru.resets[i].w[0][j] * gru.states[i].w[0][j]; |
13514
|
|
|
|
|
|
|
} |
13515
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13516
|
0
|
|
|
|
|
|
gru.candidates[i].w[0][j] = gru.X.original.b[j]; |
13517
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13518
|
0
|
|
|
|
|
|
gru.candidates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X.original.w[j][k] + gru.resetstates[i].w[0][k] * gru.H.original.w[j][k]; |
13519
|
0
|
|
|
|
|
|
gru.candidates[i].w[0][j] = tanh(gru.candidates[i].w[0][j]); |
13520
|
0
|
|
|
|
|
|
gru.states[i+1].w[0][j] = gru.updates[i].w[0][j] * gru.states[i].w[0][j] + (1.f - gru.updates[i].w[0][j]) * gru.candidates[i].w[0][j]; |
13521
|
|
|
|
|
|
|
} |
13522
|
|
|
|
|
|
|
|
13523
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13524
|
0
|
0
|
|
|
|
|
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13525
|
|
|
|
|
|
|
|
13526
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13527
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13528
|
0
|
|
|
|
|
|
output.w[j] += projection.original.w[j][k] * gru.dropouts[i].w[0][k]; |
13529
|
|
|
|
|
|
|
} |
13530
|
|
|
|
|
|
|
} |
13531
|
|
|
|
|
|
|
|
13532
|
0
|
0
|
|
|
|
|
for (auto&& output : instance_output) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13533
|
0
|
|
|
|
|
|
int best = output.w[1] > output.w[0]; |
13534
|
0
|
0
|
|
|
|
|
if (output.w[2] > output.w[best]) best = 2; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13535
|
0
|
|
|
|
|
|
float maximum = output.w[best], sum = 0; |
13536
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13537
|
0
|
|
|
|
|
|
sum = 1.f / sum; |
13538
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13539
|
|
|
|
|
|
|
|
13540
|
0
|
|
|
|
|
|
total++; |
13541
|
0
|
|
|
|
|
|
correct += best == output.outcome; |
13542
|
0
|
|
|
|
|
|
logprob += log(output.w[output.outcome]); |
13543
|
|
|
|
|
|
|
} |
13544
|
|
|
|
|
|
|
|
13545
|
|
|
|
|
|
|
// Backward pass |
13546
|
0
|
0
|
|
|
|
|
for (auto&& output : instance_output) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13547
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13548
|
0
|
0
|
|
|
|
|
output.w[j] = (output.outcome == j) - output.w[j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13549
|
|
|
|
|
|
|
|
13550
|
0
|
0
|
|
|
|
|
for (int dir = 0; dir < 2; dir++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13551
|
0
|
0
|
|
|
|
|
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13552
|
0
|
0
|
|
|
|
|
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13553
|
|
|
|
|
|
|
|
13554
|
|
|
|
|
|
|
matrix<1, D> state_g, update_g, candidate_g, reset_g, resetstate_g; |
13555
|
|
|
|
|
|
|
state_g.clear(); |
13556
|
0
|
0
|
|
|
|
|
for (size_t i = segment; i--; ) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13557
|
0
|
0
|
|
|
|
|
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13558
|
0
|
0
|
|
|
|
|
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13559
|
0
|
0
|
|
|
|
|
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13560
|
|
|
|
|
|
|
|
13561
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) // These for cycles are swapped because |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13562
|
0
|
0
|
|
|
|
|
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13563
|
0
|
|
|
|
|
|
projection.w_g[k][j] += gru.dropouts[i].w[0][j] * output.w[k]; |
13564
|
|
|
|
|
|
|
|
13565
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13566
|
0
|
0
|
|
|
|
|
if (gru.dropouts[i].w[0][j]) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13567
|
0
|
0
|
|
|
|
|
for (int k = 0; k < 3; k++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13568
|
0
|
|
|
|
|
|
state_g.w[0][j] += projection.original.w[k][j] * output.w[k]; |
13569
|
|
|
|
|
|
|
|
13570
|
|
|
|
|
|
|
resetstate_g.clear(); |
13571
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13572
|
0
|
|
|
|
|
|
update_g.w[0][j] = state_g.w[0][j] * (gru.states[i].w[0][j] - gru.candidates[i].w[0][j]); |
13573
|
0
|
|
|
|
|
|
candidate_g.w[0][j] = state_g.w[0][j] * (1.f - gru.updates[i].w[0][j]); |
13574
|
0
|
|
|
|
|
|
state_g.w[0][j] = state_g.w[0][j] * gru.updates[i].w[0][j]; |
13575
|
|
|
|
|
|
|
|
13576
|
0
|
|
|
|
|
|
candidate_g.w[0][j] *= 1 - gru.candidates[i].w[0][j] * gru.candidates[i].w[0][j]; |
13577
|
0
|
|
|
|
|
|
gru.X.b_g[j] += candidate_g.w[0][j]; |
13578
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13579
|
0
|
|
|
|
|
|
gru.X.w_g[j][k] += candidate_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k]; |
13580
|
0
|
|
|
|
|
|
gru.H.w_g[j][k] += candidate_g.w[0][j] * gru.resetstates[i].w[0][k]; |
13581
|
0
|
|
|
|
|
|
embedding->w_g[0][k] += embedding_dropout.w[0][k] * candidate_g.w[0][j] * gru.X.original.w[j][k]; |
13582
|
0
|
|
|
|
|
|
resetstate_g.w[0][k] += candidate_g.w[0][j] * gru.H.original.w[j][k]; |
13583
|
|
|
|
|
|
|
} |
13584
|
|
|
|
|
|
|
} |
13585
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13586
|
0
|
|
|
|
|
|
state_g.w[0][j] += resetstate_g.w[0][j] * gru.resets[i].w[0][j]; |
13587
|
0
|
|
|
|
|
|
reset_g.w[0][j] = resetstate_g.w[0][j] * gru.states[i].w[0][j]; |
13588
|
|
|
|
|
|
|
|
13589
|
0
|
|
|
|
|
|
update_g.w[0][j] *= gru.updates[i].w[0][j] * (1 - gru.updates[i].w[0][j]); |
13590
|
0
|
|
|
|
|
|
reset_g.w[0][j] *= gru.resets[i].w[0][j] * (1 - gru.resets[i].w[0][j]); |
13591
|
|
|
|
|
|
|
|
13592
|
0
|
|
|
|
|
|
gru.X_z.b_g[j] += update_g.w[0][j]; |
13593
|
0
|
|
|
|
|
|
gru.X_r.b_g[j] += reset_g.w[0][j]; |
13594
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13595
|
0
|
|
|
|
|
|
gru.X_z.w_g[j][k] += update_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k]; |
13596
|
0
|
|
|
|
|
|
gru.H_z.w_g[j][k] += update_g.w[0][j] * gru.states[i].w[0][k]; |
13597
|
0
|
|
|
|
|
|
gru.X_r.w_g[j][k] += reset_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k]; |
13598
|
0
|
|
|
|
|
|
gru.H_r.w_g[j][k] += reset_g.w[0][j] * gru.states[i].w[0][k]; |
13599
|
0
|
|
|
|
|
|
embedding->w_g[0][k] += embedding_dropout.w[0][k] * (update_g.w[0][j] * gru.X_z.original.w[j][k] + |
13600
|
0
|
|
|
|
|
|
reset_g.w[0][j] * gru.X_r.original.w[j][k]); |
13601
|
0
|
|
|
|
|
|
state_g.w[0][k] += update_g.w[0][j] * gru.H_z.original.w[j][k] + reset_g.w[0][j] * gru.H_r.original.w[j][k]; |
13602
|
|
|
|
|
|
|
} |
13603
|
|
|
|
|
|
|
} |
13604
|
|
|
|
|
|
|
} |
13605
|
|
|
|
|
|
|
} |
13606
|
|
|
|
|
|
|
|
13607
|
|
|
|
|
|
|
// Update the weights |
13608
|
0
|
0
|
|
|
|
|
if (batch_size == 1 || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13609
|
|
|
|
|
|
|
instance+1 == instances || |
13610
|
|
|
|
|
|
|
(instance+1) % batch_size == 0) { |
13611
|
0
|
|
|
|
|
|
b1t *= 0.9f; |
13612
|
0
|
|
|
|
|
|
b2t *= 0.999f; |
13613
|
0
|
|
|
|
|
|
float learning_rate_biased = learning_rate * sqrt(1-b2t) / (1-b1t); |
13614
|
|
|
|
|
|
|
|
13615
|
0
|
0
|
|
|
|
|
if (batch_size == 1) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13616
|
0
|
0
|
|
|
|
|
for (auto&& chosen_embedding : chosen_embeddings) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13617
|
0
|
|
|
|
|
|
chosen_embedding->update_weights(learning_rate_biased); |
13618
|
|
|
|
|
|
|
else |
13619
|
0
|
0
|
|
|
|
|
for (auto&& embedding : embeddings) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13620
|
0
|
|
|
|
|
|
embedding.second.update_weights(learning_rate_biased); |
13621
|
0
|
|
|
|
|
|
gru_fwd.update_weights(learning_rate_biased); |
13622
|
0
|
|
|
|
|
|
gru_bwd.update_weights(learning_rate_biased); |
13623
|
0
|
|
|
|
|
|
projection_fwd.update_weights(learning_rate_biased); |
13624
|
0
|
|
|
|
|
|
projection_bwd.update_weights(learning_rate_biased); |
13625
|
|
|
|
|
|
|
} |
13626
|
|
|
|
|
|
|
} |
13627
|
0
|
0
|
|
|
|
|
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13628
|
0
|
|
|
|
|
|
learning_rate = exp(((epochs - epoch - 2) * log(learning_rate_initial) + (epoch + 1) * log(learning_rate_final)) / (epochs - 1)); |
13629
|
|
|
|
|
|
|
|
13630
|
|
|
|
|
|
|
// Evaluate |
13631
|
0
|
0
|
|
|
|
|
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13632
|
0
|
|
|
|
|
|
<< ", training acc: " << fixed << setprecision(2) << 100. * correct / double(total) << "%"; |
13633
|
0
|
0
|
|
|
|
|
if (!heldout.empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13634
|
|
|
|
|
|
|
f1_info tokens, sentences; |
13635
|
0
|
0
|
|
|
|
|
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13636
|
0
|
0
|
|
|
|
|
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13637
|
0
|
|
|
|
|
|
<< 100. * tokens.f1 << "%, sentences: " << 100. * sentences.precision << "%P/" |
13638
|
0
|
|
|
|
|
|
<< 100. * sentences.recall << "%R/" << 100. * sentences.f1 << "%"; |
13639
|
|
|
|
|
|
|
|
13640
|
0
|
0
|
|
|
|
|
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13641
|
0
|
|
|
|
|
|
best_combined_f1 = sentences.f1 + tokens.f1; |
13642
|
|
|
|
|
|
|
best_combined_f1_epoch = epoch; |
13643
|
|
|
|
|
|
|
best_combined_f1_network = *this; |
13644
|
|
|
|
|
|
|
} |
13645
|
0
|
0
|
|
|
|
|
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13646
|
|
|
|
|
|
|
cerr << endl << "Stopping after 30 iterations of not improving sum of sentence and token f1." << endl; |
13647
|
0
|
|
|
|
|
|
break; |
13648
|
|
|
|
|
|
|
} |
13649
|
|
|
|
|
|
|
} |
13650
|
|
|
|
|
|
|
cerr << endl; |
13651
|
|
|
|
|
|
|
} |
13652
|
|
|
|
|
|
|
|
13653
|
|
|
|
|
|
|
// Choose best network if desired |
13654
|
0
|
0
|
|
|
|
|
if (early_stopping && best_combined_f1) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13655
|
0
|
|
|
|
|
|
cerr << "Choosing parameters from epoch " << best_combined_f1_epoch+1 << "." << endl; |
13656
|
|
|
|
|
|
|
this->embeddings = best_combined_f1_network.embeddings; |
13657
|
0
|
|
|
|
|
|
this->gru_fwd = best_combined_f1_network.gru_fwd; |
13658
|
0
|
|
|
|
|
|
this->gru_bwd = best_combined_f1_network.gru_bwd; |
13659
|
0
|
|
|
|
|
|
this->projection_fwd = best_combined_f1_network.projection_fwd; |
13660
|
0
|
|
|
|
|
|
this->projection_bwd = best_combined_f1_network.projection_bwd; |
13661
|
|
|
|
|
|
|
} |
13662
|
|
|
|
|
|
|
|
13663
|
|
|
|
|
|
|
// Encode the network |
13664
|
0
|
0
|
|
|
|
|
enc.add_1B(1); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13665
|
0
|
0
|
|
|
|
|
enc.add_1B(D); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13666
|
|
|
|
|
|
|
|
13667
|
0
|
|
|
|
|
|
enc.add_4B(this->embeddings.size()); |
13668
|
0
|
0
|
|
|
|
|
for (auto&& embedding : this->embeddings) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13669
|
0
|
|
|
|
|
|
enc.add_4B(embedding.first); |
13670
|
0
|
|
|
|
|
|
enc.add_data(embedding.second.e.w[0], D); |
13671
|
|
|
|
|
|
|
} |
13672
|
0
|
0
|
|
|
|
|
save_gru(this->gru_fwd, enc); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13673
|
0
|
0
|
|
|
|
|
save_gru(this->gru_bwd, enc); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13674
|
0
|
0
|
|
|
|
|
save_matrix(this->projection_fwd, enc); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13675
|
0
|
0
|
|
|
|
|
save_matrix(this->projection_bwd, enc); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13676
|
|
|
|
|
|
|
|
13677
|
|
|
|
|
|
|
return true; |
13678
|
|
|
|
|
|
|
} |
13679
|
|
|
|
|
|
|
|
13680
|
|
|
|
|
|
|
template template |
13681
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::matrix_trainer::update_weights(float learning_rate) { |
13682
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13683
|
0
|
0
|
|
|
|
|
for (int j = 0; j < C; j++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13684
|
0
|
|
|
|
|
|
w_m[i][j] = 0.9 * w_m[i][j] + (1-0.9) * w_g[i][j]; |
13685
|
0
|
|
|
|
|
|
w_v[i][j] = 0.999 * w_v[i][j] + (1-0.999) * w_g[i][j] * w_g[i][j]; |
13686
|
0
|
|
|
|
|
|
original.w[i][j] += learning_rate * w_m[i][j] / (sqrt(w_v[i][j]) + 1e-8); |
13687
|
|
|
|
|
|
|
} |
13688
|
0
|
|
|
|
|
|
b_m[i] = 0.9 * b_m[i] + (1-0.9) * b_g[i]; |
13689
|
0
|
|
|
|
|
|
b_v[i] = 0.999 * b_v[i] + (1-0.999) * b_g[i] * b_g[i]; |
13690
|
0
|
|
|
|
|
|
original.b[i] += learning_rate * b_m[i] / (sqrt(b_v[i]) + 1e-8); |
13691
|
|
|
|
|
|
|
} |
13692
|
|
|
|
|
|
|
|
13693
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13694
|
0
|
0
|
|
|
|
|
for (int j = 0; j < C; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13695
|
0
|
|
|
|
|
|
w_g[i][j] = 0.f; |
13696
|
0
|
|
|
|
|
|
b_g[i] = 0.f; |
13697
|
|
|
|
|
|
|
} |
13698
|
0
|
|
|
|
|
|
} |
13699
|
|
|
|
|
|
|
|
13700
|
|
|
|
|
|
|
template |
13701
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::gru_trainer::update_weights(float learning_rate) { |
13702
|
0
|
|
|
|
|
|
X.update_weights(learning_rate); |
13703
|
0
|
|
|
|
|
|
X_r.update_weights(learning_rate); |
13704
|
0
|
|
|
|
|
|
X_z.update_weights(learning_rate); |
13705
|
0
|
|
|
|
|
|
H.update_weights(learning_rate); |
13706
|
0
|
|
|
|
|
|
H_r.update_weights(learning_rate); |
13707
|
0
|
|
|
|
|
|
H_z.update_weights(learning_rate); |
13708
|
0
|
|
|
|
|
|
} |
13709
|
|
|
|
|
|
|
|
13710
|
|
|
|
|
|
|
template |
13711
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector& heldout, |
13712
|
|
|
|
|
|
|
f1_info& tokens_f1, f1_info& sentences_f1) { |
13713
|
|
|
|
|
|
|
// Generate gold data |
13714
|
|
|
|
|
|
|
vector gold_sentences, gold_tokens; |
13715
|
|
|
|
|
|
|
u32string text; |
13716
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13717
|
0
|
0
|
|
|
|
|
if (sentence.tokens.empty()) continue; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13718
|
|
|
|
|
|
|
|
13719
|
0
|
0
|
|
|
|
|
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13720
|
0
|
0
|
|
|
|
|
for (auto&& token : sentence.tokens) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13721
|
0
|
0
|
|
|
|
|
gold_tokens.emplace_back(text.size() + token.start, token.length); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13722
|
|
|
|
|
|
|
text.append(sentence.sentence); |
13723
|
|
|
|
|
|
|
} |
13724
|
|
|
|
|
|
|
|
13725
|
|
|
|
|
|
|
// Generate system data |
13726
|
|
|
|
|
|
|
vector system_sentences, system_tokens, tokens; |
13727
|
|
|
|
|
|
|
string text_utf8; |
13728
|
|
|
|
|
|
|
|
13729
|
0
|
|
|
|
|
|
this->cache_embeddings(); |
13730
|
0
|
|
|
|
|
|
gru_tokenizer tokenizer(url_email_tokenizer, segment, allow_spaces, *this); |
13731
|
0
|
0
|
|
|
|
|
unilib::utf8::encode(text, text_utf8); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13732
|
0
|
0
|
|
|
|
|
tokenizer.set_text(text_utf8); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13733
|
|
|
|
|
|
|
|
13734
|
0
|
0
|
|
|
|
|
while (tokenizer.next_sentence(tokens)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13735
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13736
|
0
|
0
|
|
|
|
|
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13737
|
0
|
0
|
|
|
|
|
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13738
|
|
|
|
|
|
|
} |
13739
|
|
|
|
|
|
|
|
13740
|
0
|
|
|
|
|
|
evaluate_f1(system_tokens, gold_tokens, tokens_f1); |
13741
|
0
|
|
|
|
|
|
evaluate_f1(system_sentences, gold_sentences, sentences_f1); |
13742
|
0
|
|
|
|
|
|
} |
13743
|
|
|
|
|
|
|
|
13744
|
|
|
|
|
|
|
template |
13745
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::evaluate_f1(const vector& system, const vector& gold, f1_info& f1) { |
13746
|
|
|
|
|
|
|
size_t both = 0; |
13747
|
0
|
0
|
|
|
|
|
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13748
|
0
|
0
|
|
|
|
|
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13749
|
0
|
|
|
|
|
|
si++; |
13750
|
0
|
0
|
|
|
|
|
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13751
|
0
|
|
|
|
|
|
gi++; |
13752
|
|
|
|
|
|
|
else |
13753
|
0
|
|
|
|
|
|
both += system[si++].length == gold[gi++].length; |
13754
|
|
|
|
|
|
|
|
13755
|
0
|
0
|
|
|
|
|
f1.precision = system.size() ? both / double(system.size()) : 0.; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13756
|
0
|
0
|
|
|
|
|
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13757
|
0
|
0
|
|
|
|
|
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13758
|
0
|
|
|
|
|
|
} |
13759
|
|
|
|
|
|
|
|
13760
|
|
|
|
|
|
|
template template |
13761
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::random_matrix(matrix& m, mt19937& generator, float range, float bias) { |
13762
|
0
|
|
|
|
|
|
uniform_real_distribution uniform(-range, range); |
13763
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13764
|
0
|
|
|
|
|
|
m.b[i] = bias; |
13765
|
0
|
0
|
|
|
|
|
for (int j = 0; j < C; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13766
|
0
|
|
|
|
|
|
m.w[i][j] = uniform(generator); |
13767
|
|
|
|
|
|
|
} |
13768
|
0
|
|
|
|
|
|
} |
13769
|
|
|
|
|
|
|
|
13770
|
|
|
|
|
|
|
template |
13771
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::random_gru(gru& g, mt19937& generator, float range) { |
13772
|
0
|
|
|
|
|
|
random_matrix(g.X, generator, range, 0.f); |
13773
|
0
|
|
|
|
|
|
random_matrix(g.X_r, generator, range, 1.f); |
13774
|
0
|
|
|
|
|
|
random_matrix(g.X_z, generator, range, 1.f); |
13775
|
0
|
|
|
|
|
|
random_matrix(g.H, generator, range, 0.f); |
13776
|
0
|
|
|
|
|
|
random_matrix(g.H_r, generator, range, 1.f); |
13777
|
0
|
|
|
|
|
|
random_matrix(g.H_z, generator, range, 1.f); |
13778
|
0
|
|
|
|
|
|
} |
13779
|
|
|
|
|
|
|
|
13780
|
|
|
|
|
|
|
template template |
13781
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::save_matrix(const matrix& m, binary_encoder& enc) { |
13782
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13783
|
0
|
|
|
|
|
|
enc.add_data(m.w[i], C); |
13784
|
0
|
|
|
|
|
|
enc.add_data(m.b, R); |
13785
|
0
|
|
|
|
|
|
} |
13786
|
|
|
|
|
|
|
|
13787
|
|
|
|
|
|
|
template |
13788
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::save_gru(const gru& g, binary_encoder& enc) { |
13789
|
0
|
|
|
|
|
|
save_matrix(g.X, enc); |
13790
|
0
|
|
|
|
|
|
save_matrix(g.X_r, enc); |
13791
|
0
|
|
|
|
|
|
save_matrix(g.X_z, enc); |
13792
|
0
|
|
|
|
|
|
save_matrix(g.H, enc); |
13793
|
0
|
|
|
|
|
|
save_matrix(g.H_r, enc); |
13794
|
0
|
|
|
|
|
|
save_matrix(g.H_z, enc); |
13795
|
0
|
|
|
|
|
|
} |
13796
|
|
|
|
|
|
|
|
13797
|
|
|
|
|
|
|
} // namespace morphodita |
13798
|
|
|
|
|
|
|
|
13799
|
|
|
|
|
|
|
///////// |
13800
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_trainer.cpp |
13801
|
|
|
|
|
|
|
///////// |
13802
|
|
|
|
|
|
|
|
13803
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13804
|
|
|
|
|
|
|
// |
13805
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
13806
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13807
|
|
|
|
|
|
|
// |
13808
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13809
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13810
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13811
|
|
|
|
|
|
|
|
13812
|
|
|
|
|
|
|
namespace morphodita { |
13813
|
|
|
|
|
|
|
|
13814
|
0
|
|
|
|
|
|
bool gru_tokenizer_trainer::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs, |
13815
|
|
|
|
|
|
|
unsigned batch_size, float learning_rate, float learning_rate_final, float dropout, |
13816
|
|
|
|
|
|
|
float initialization_range, bool early_stopping, const vector& data, |
13817
|
|
|
|
|
|
|
const vector& heldout, ostream& os, string& error) { |
13818
|
|
|
|
|
|
|
using namespace unilib; |
13819
|
|
|
|
|
|
|
|
13820
|
|
|
|
|
|
|
error.clear(); |
13821
|
|
|
|
|
|
|
|
13822
|
|
|
|
|
|
|
// Start encoding the tokenizer |
13823
|
0
|
|
|
|
|
|
os.put(2); |
13824
|
|
|
|
|
|
|
|
13825
|
0
|
|
|
|
|
|
binary_encoder enc; |
13826
|
0
|
0
|
|
|
|
|
enc.add_1B(url_email_tokenizer); |
13827
|
0
|
0
|
|
|
|
|
enc.add_2B(segment); |
13828
|
0
|
0
|
|
|
|
|
enc.add_1B(allow_spaces); |
13829
|
|
|
|
|
|
|
|
13830
|
|
|
|
|
|
|
// Train the GRU network |
13831
|
0
|
0
|
|
|
|
|
if (dimension == 16) { |
13832
|
|
|
|
|
|
|
gru_tokenizer_network_trainer<16> network; |
13833
|
0
|
0
|
|
|
|
|
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
0
|
|
|
|
|
|
13834
|
|
|
|
|
|
|
dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false; |
13835
|
0
|
0
|
|
|
|
|
} else if (dimension == 24) { |
13836
|
|
|
|
|
|
|
gru_tokenizer_network_trainer<24> network; |
13837
|
0
|
0
|
|
|
|
|
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
0
|
|
|
|
|
|
13838
|
|
|
|
|
|
|
dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false; |
13839
|
0
|
0
|
|
|
|
|
} else if (dimension == 64) { |
13840
|
|
|
|
|
|
|
gru_tokenizer_network_trainer<64> network; |
13841
|
0
|
0
|
|
|
|
|
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
0
|
|
|
|
|
|
13842
|
|
|
|
|
|
|
dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false; |
13843
|
|
|
|
|
|
|
} else { |
13844
|
0
|
0
|
|
|
|
|
return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false; |
|
|
0
|
|
|
|
|
|
13845
|
|
|
|
|
|
|
} |
13846
|
|
|
|
|
|
|
|
13847
|
|
|
|
|
|
|
// Compute best substitutions for every category |
13848
|
|
|
|
|
|
|
unordered_map> counts; |
13849
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
13850
|
0
|
0
|
|
|
|
|
for (auto&& chr : sentence.sentence) |
13851
|
0
|
|
|
|
|
|
counts[unicode::category(chr)][chr]++; |
13852
|
|
|
|
|
|
|
|
13853
|
|
|
|
|
|
|
unordered_map unknown_chars; |
13854
|
0
|
0
|
|
|
|
|
for (auto&& count : counts) { |
13855
|
0
|
|
|
|
|
|
char32_t best_chr = 0; |
13856
|
|
|
|
|
|
|
unsigned best = 0; |
13857
|
0
|
0
|
|
|
|
|
for (auto&& chr : count.second) |
13858
|
0
|
0
|
|
|
|
|
if (chr.second > best) |
13859
|
0
|
|
|
|
|
|
best = chr.second, best_chr = chr.first; |
13860
|
0
|
0
|
|
|
|
|
if (best_chr) |
13861
|
0
|
|
|
|
|
|
unknown_chars.emplace(count.first, best_chr); |
13862
|
|
|
|
|
|
|
} |
13863
|
0
|
0
|
|
|
|
|
enc.add_1B(unknown_chars.size()); |
13864
|
0
|
0
|
|
|
|
|
for (auto&& unknown_char : unknown_chars) { |
13865
|
0
|
|
|
|
|
|
enc.add_4B(unknown_char.first); |
13866
|
0
|
|
|
|
|
|
enc.add_4B(unknown_char.second); |
13867
|
|
|
|
|
|
|
} |
13868
|
|
|
|
|
|
|
|
13869
|
0
|
0
|
|
|
|
|
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
13870
|
|
|
|
|
|
|
return true; |
13871
|
|
|
|
|
|
|
} |
13872
|
|
|
|
|
|
|
|
13873
|
|
|
|
|
|
|
} // namespace morphodita |
13874
|
|
|
|
|
|
|
|
13875
|
|
|
|
|
|
|
///////// |
13876
|
|
|
|
|
|
|
// File: morphodita/tokenizer/ragel_tokenizer.cpp |
13877
|
|
|
|
|
|
|
///////// |
13878
|
|
|
|
|
|
|
|
13879
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
13880
|
|
|
|
|
|
|
// |
13881
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
13882
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
13883
|
|
|
|
|
|
|
// |
13884
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
13885
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
13886
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
13887
|
|
|
|
|
|
|
|
13888
|
|
|
|
|
|
|
namespace morphodita { |
13889
|
|
|
|
|
|
|
|
13890
|
|
|
|
|
|
|
static const char _ragel_url_email_cond_offsets[] = { |
13891
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13892
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 1, 1, 1, |
13893
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, |
13894
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, |
13895
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, |
13896
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, |
13897
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, |
13898
|
|
|
|
|
|
|
1, 1, 1, 2, 3, 3, 4, 5, |
13899
|
|
|
|
|
|
|
6, 7, 8, 9, 10, 11, 12, 13, |
13900
|
|
|
|
|
|
|
14, 15, 16 |
13901
|
|
|
|
|
|
|
}; |
13902
|
|
|
|
|
|
|
|
13903
|
|
|
|
|
|
|
static const char _ragel_url_email_cond_lengths[] = { |
13904
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13905
|
|
|
|
|
|
|
0, 0, 0, 0, 1, 0, 0, 0, |
13906
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13907
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13908
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13909
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13910
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13911
|
|
|
|
|
|
|
0, 0, 1, 1, 0, 1, 1, 1, |
13912
|
|
|
|
|
|
|
1, 1, 1, 1, 1, 1, 1, 1, |
13913
|
|
|
|
|
|
|
1, 1, 1 |
13914
|
|
|
|
|
|
|
}; |
13915
|
|
|
|
|
|
|
|
13916
|
|
|
|
|
|
|
static const short _ragel_url_email_cond_keys[] = { |
13917
|
|
|
|
|
|
|
41u, 41u, 47u, 47u, 47u, 47u, 41u, 41u, |
13918
|
|
|
|
|
|
|
47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u, |
13919
|
|
|
|
|
|
|
47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u, |
13920
|
|
|
|
|
|
|
47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u, |
13921
|
|
|
|
|
|
|
47u, 47u, 0 |
13922
|
|
|
|
|
|
|
}; |
13923
|
|
|
|
|
|
|
|
13924
|
|
|
|
|
|
|
static const char _ragel_url_email_cond_spaces[] = { |
13925
|
|
|
|
|
|
|
1, 0, 0, 1, 0, 0, 0, 0, |
13926
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
13927
|
|
|
|
|
|
|
0, 0 |
13928
|
|
|
|
|
|
|
}; |
13929
|
|
|
|
|
|
|
|
13930
|
|
|
|
|
|
|
static const short _ragel_url_email_key_offsets[] = { |
13931
|
|
|
|
|
|
|
0, 0, 15, 29, 41, 54, 63, 71, |
13932
|
|
|
|
|
|
|
78, 86, 92, 100, 117, 145, 154, 162, |
13933
|
|
|
|
|
|
|
171, 179, 188, 196, 204, 215, 225, 233, |
13934
|
|
|
|
|
|
|
241, 252, 262, 270, 278, 289, 299, 315, |
13935
|
|
|
|
|
|
|
330, 346, 360, 376, 393, 409, 426, 442, |
13936
|
|
|
|
|
|
|
459, 475, 491, 510, 528, 544, 560, 579, |
13937
|
|
|
|
|
|
|
597, 613, 629, 648, 666, 682, 698, 714, |
13938
|
|
|
|
|
|
|
725, 726, 741, 752, 756, 773, 801, 812, |
13939
|
|
|
|
|
|
|
823, 834, 848, 861, 879, 893, 908, 926, |
13940
|
|
|
|
|
|
|
944, 962, 983 |
13941
|
|
|
|
|
|
|
}; |
13942
|
|
|
|
|
|
|
|
13943
|
|
|
|
|
|
|
static const short _ragel_url_email_trans_keys[] = { |
13944
|
|
|
|
|
|
|
33u, 48u, 49u, 50u, 95u, 36u, 37u, 39u, |
13945
|
|
|
|
|
|
|
46u, 51u, 57u, 65u, 90u, 97u, 122u, 33u, |
13946
|
|
|
|
|
|
|
58u, 64u, 95u, 36u, 37u, 39u, 46u, 48u, |
13947
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 33u, 95u, 36u, |
13948
|
|
|
|
|
|
|
37u, 39u, 46u, 48u, 57u, 65u, 90u, 97u, |
13949
|
|
|
|
|
|
|
122u, 33u, 64u, 95u, 36u, 37u, 39u, 46u, |
13950
|
|
|
|
|
|
|
48u, 57u, 65u, 90u, 97u, 122u, 48u, 49u, |
13951
|
|
|
|
|
|
|
50u, 51u, 57u, 65u, 90u, 97u, 122u, 45u, |
13952
|
|
|
|
|
|
|
46u, 48u, 57u, 65u, 90u, 97u, 122u, 45u, |
13953
|
|
|
|
|
|
|
48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u, |
13954
|
|
|
|
|
|
|
48u, 57u, 65u, 90u, 97u, 122u, 48u, 57u, |
13955
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u, |
13956
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 33u, 39u, 41u, 61u, |
13957
|
|
|
|
|
|
|
95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u, |
13958
|
|
|
|
|
|
|
64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u, |
13959
|
|
|
|
|
|
|
44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u, |
13960
|
|
|
|
|
|
|
151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u, |
13961
|
|
|
|
|
|
|
59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u, |
13962
|
|
|
|
|
|
|
159u, 48u, 49u, 50u, 51u, 57u, 65u, 90u, |
13963
|
|
|
|
|
|
|
97u, 122u, 45u, 46u, 48u, 57u, 65u, 90u, |
13964
|
|
|
|
|
|
|
97u, 122u, 48u, 49u, 50u, 51u, 57u, 65u, |
13965
|
|
|
|
|
|
|
90u, 97u, 122u, 45u, 46u, 48u, 57u, 65u, |
13966
|
|
|
|
|
|
|
90u, 97u, 122u, 48u, 49u, 50u, 51u, 57u, |
13967
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u, |
13968
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u, |
13969
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 45u, 46u, 53u, 48u, |
13970
|
|
|
|
|
|
|
52u, 54u, 57u, 65u, 90u, 97u, 122u, 45u, |
13971
|
|
|
|
|
|
|
46u, 48u, 53u, 54u, 57u, 65u, 90u, 97u, |
13972
|
|
|
|
|
|
|
122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u, |
13973
|
|
|
|
|
|
|
122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u, |
13974
|
|
|
|
|
|
|
122u, 45u, 46u, 53u, 48u, 52u, 54u, 57u, |
13975
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 45u, 46u, 48u, 53u, |
13976
|
|
|
|
|
|
|
54u, 57u, 65u, 90u, 97u, 122u, 45u, 46u, |
13977
|
|
|
|
|
|
|
48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u, |
13978
|
|
|
|
|
|
|
48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u, |
13979
|
|
|
|
|
|
|
53u, 48u, 52u, 54u, 57u, 65u, 90u, 97u, |
13980
|
|
|
|
|
|
|
122u, 45u, 46u, 48u, 53u, 54u, 57u, 65u, |
13981
|
|
|
|
|
|
|
90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u, |
13982
|
|
|
|
|
|
|
95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u, |
13983
|
|
|
|
|
|
|
90u, 97u, 122u, 33u, 45u, 58u, 64u, 95u, |
13984
|
|
|
|
|
|
|
36u, 37u, 39u, 46u, 48u, 57u, 65u, 90u, |
13985
|
|
|
|
|
|
|
97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u, |
13986
|
|
|
|
|
|
|
36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u, |
13987
|
|
|
|
|
|
|
97u, 122u, 33u, 58u, 64u, 95u, 36u, 37u, |
13988
|
|
|
|
|
|
|
39u, 46u, 48u, 57u, 65u, 90u, 97u, 122u, |
13989
|
|
|
|
|
|
|
33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u, |
13990
|
|
|
|
|
|
|
39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u, |
13991
|
|
|
|
|
|
|
33u, 48u, 49u, 50u, 58u, 64u, 95u, 36u, |
13992
|
|
|
|
|
|
|
37u, 39u, 46u, 51u, 57u, 65u, 90u, 97u, |
13993
|
|
|
|
|
|
|
122u, 33u, 45u, 46u, 58u, 64u, 95u, 36u, |
13994
|
|
|
|
|
|
|
37u, 39u, 44u, 48u, 57u, 65u, 90u, 97u, |
13995
|
|
|
|
|
|
|
122u, 33u, 48u, 49u, 50u, 58u, 64u, 95u, |
13996
|
|
|
|
|
|
|
36u, 37u, 39u, 46u, 51u, 57u, 65u, 90u, |
13997
|
|
|
|
|
|
|
97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u, |
13998
|
|
|
|
|
|
|
36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u, |
13999
|
|
|
|
|
|
|
97u, 122u, 33u, 48u, 49u, 50u, 58u, 64u, |
14000
|
|
|
|
|
|
|
95u, 36u, 37u, 39u, 46u, 51u, 57u, 65u, |
14001
|
|
|
|
|
|
|
90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u, |
14002
|
|
|
|
|
|
|
95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u, |
14003
|
|
|
|
|
|
|
90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u, |
14004
|
|
|
|
|
|
|
95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u, |
14005
|
|
|
|
|
|
|
90u, 97u, 122u, 33u, 45u, 46u, 53u, 58u, |
14006
|
|
|
|
|
|
|
64u, 95u, 36u, 37u, 39u, 44u, 48u, 52u, |
14007
|
|
|
|
|
|
|
54u, 57u, 65u, 90u, 97u, 122u, 33u, 45u, |
14008
|
|
|
|
|
|
|
46u, 58u, 64u, 95u, 36u, 37u, 39u, 44u, |
14009
|
|
|
|
|
|
|
48u, 53u, 54u, 57u, 65u, 90u, 97u, 122u, |
14010
|
|
|
|
|
|
|
33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u, |
14011
|
|
|
|
|
|
|
39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u, |
14012
|
|
|
|
|
|
|
33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u, |
14013
|
|
|
|
|
|
|
39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u, |
14014
|
|
|
|
|
|
|
33u, 45u, 46u, 53u, 58u, 64u, 95u, 36u, |
14015
|
|
|
|
|
|
|
37u, 39u, 44u, 48u, 52u, 54u, 57u, 65u, |
14016
|
|
|
|
|
|
|
90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u, |
14017
|
|
|
|
|
|
|
95u, 36u, 37u, 39u, 44u, 48u, 53u, 54u, |
14018
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u, |
14019
|
|
|
|
|
|
|
58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u, |
14020
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u, |
14021
|
|
|
|
|
|
|
58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u, |
14022
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u, |
14023
|
|
|
|
|
|
|
53u, 58u, 64u, 95u, 36u, 37u, 39u, 44u, |
14024
|
|
|
|
|
|
|
48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u, |
14025
|
|
|
|
|
|
|
33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u, |
14026
|
|
|
|
|
|
|
39u, 44u, 48u, 53u, 54u, 57u, 65u, 90u, |
14027
|
|
|
|
|
|
|
97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u, |
14028
|
|
|
|
|
|
|
36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u, |
14029
|
|
|
|
|
|
|
97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u, |
14030
|
|
|
|
|
|
|
36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u, |
14031
|
|
|
|
|
|
|
97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u, |
14032
|
|
|
|
|
|
|
36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u, |
14033
|
|
|
|
|
|
|
97u, 122u, 33u, 47u, 95u, 36u, 37u, 39u, |
14034
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 47u, 33u, 48u, |
14035
|
|
|
|
|
|
|
49u, 50u, 95u, 36u, 37u, 39u, 46u, 51u, |
14036
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 45u, 46u, 58u, |
14037
|
|
|
|
|
|
|
303u, 559u, 48u, 57u, 65u, 90u, 97u, 122u, |
14038
|
|
|
|
|
|
|
303u, 559u, 48u, 57u, 33u, 39u, 41u, 61u, |
14039
|
|
|
|
|
|
|
95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u, |
14040
|
|
|
|
|
|
|
64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u, |
14041
|
|
|
|
|
|
|
44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u, |
14042
|
|
|
|
|
|
|
151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u, |
14043
|
|
|
|
|
|
|
59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u, |
14044
|
|
|
|
|
|
|
159u, 45u, 46u, 58u, 303u, 559u, 48u, 57u, |
14045
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 45u, 46u, 58u, 303u, |
14046
|
|
|
|
|
|
|
559u, 48u, 57u, 65u, 90u, 97u, 122u, 45u, |
14047
|
|
|
|
|
|
|
46u, 58u, 303u, 559u, 48u, 57u, 65u, 90u, |
14048
|
|
|
|
|
|
|
97u, 122u, 45u, 46u, 53u, 58u, 303u, 559u, |
14049
|
|
|
|
|
|
|
48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u, |
14050
|
|
|
|
|
|
|
45u, 46u, 58u, 303u, 559u, 48u, 53u, 54u, |
14051
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u, |
14052
|
|
|
|
|
|
|
58u, 64u, 95u, 303u, 559u, 36u, 37u, 39u, |
14053
|
|
|
|
|
|
|
44u, 48u, 57u, 65u, 90u, 97u, 122u, 33u, |
14054
|
|
|
|
|
|
|
95u, 303u, 559u, 36u, 37u, 39u, 46u, 48u, |
14055
|
|
|
|
|
|
|
57u, 65u, 90u, 97u, 122u, 33u, 64u, 95u, |
14056
|
|
|
|
|
|
|
303u, 559u, 36u, 37u, 39u, 46u, 48u, 57u, |
14057
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 33u, 45u, 46u, 58u, |
14058
|
|
|
|
|
|
|
64u, 95u, 303u, 559u, 36u, 37u, 39u, 44u, |
14059
|
|
|
|
|
|
|
48u, 57u, 65u, 90u, 97u, 122u, 33u, 45u, |
14060
|
|
|
|
|
|
|
46u, 58u, 64u, 95u, 303u, 559u, 36u, 37u, |
14061
|
|
|
|
|
|
|
39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u, |
14062
|
|
|
|
|
|
|
33u, 45u, 46u, 58u, 64u, 95u, 303u, 559u, |
14063
|
|
|
|
|
|
|
36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u, |
14064
|
|
|
|
|
|
|
97u, 122u, 33u, 45u, 46u, 53u, 58u, 64u, |
14065
|
|
|
|
|
|
|
95u, 303u, 559u, 36u, 37u, 39u, 44u, 48u, |
14066
|
|
|
|
|
|
|
52u, 54u, 57u, 65u, 90u, 97u, 122u, 33u, |
14067
|
|
|
|
|
|
|
45u, 46u, 58u, 64u, 95u, 303u, 559u, 36u, |
14068
|
|
|
|
|
|
|
37u, 39u, 44u, 48u, 53u, 54u, 57u, 65u, |
14069
|
|
|
|
|
|
|
90u, 97u, 122u, 0 |
14070
|
|
|
|
|
|
|
}; |
14071
|
|
|
|
|
|
|
|
14072
|
|
|
|
|
|
|
// Generated table for the URL/e-mail state machine, indexed by the current
// state `cs`. Each entry is the number of single-value keys of that state;
// ragel_url_email() binary-searches that many keys for an exact match with the
// (possibly widened) input character before it tries the range keys.
static const char _ragel_url_email_single_lengths[] = {
  0, 5, 4, 2, 3, 3, 2, 1,
  2, 0, 2, 5, 14, 3, 2, 3,
  2, 3, 2, 2, 3, 2, 2, 2,
  3, 2, 2, 2, 3, 2, 6, 5,
  6, 4, 6, 7, 6, 7, 6, 7,
  6, 6, 7, 6, 6, 6, 7, 6,
  6, 6, 7, 6, 6, 6, 6, 3,
  1, 5, 5, 2, 5, 14, 5, 5,
  5, 6, 5, 8, 4, 5, 8, 8,
  8, 9, 8
};
14084
|
|
|
|
|
|
|
|
14085
|
|
|
|
|
|
|
// Generated table for the URL/e-mail state machine, indexed by the current
// state `cs`. Each entry is the number of [low, high] key pairs of that state;
// ragel_url_email() binary-searches these pairs (two shorts per pair) after the
// single-value keys did not match.
static const char _ragel_url_email_range_lengths[] = {
  0, 5, 5, 5, 5, 3, 3, 3,
  3, 3, 3, 6, 7, 3, 3, 3,
  3, 3, 3, 3, 4, 4, 3, 3,
  4, 4, 3, 3, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 5, 5,
  5, 5, 6, 6, 5, 5, 6, 6,
  5, 5, 6, 6, 5, 5, 5, 4,
  0, 5, 3, 1, 6, 7, 3, 3,
  3, 4, 4, 5, 5, 5, 5, 5,
  5, 6, 6
};
14097
|
|
|
|
|
|
|
|
14098
|
|
|
|
|
|
|
// Generated table for the URL/e-mail state machine, indexed by the current
// state `cs`. Each entry is the position in _ragel_url_email_indicies where the
// slots of that state begin. ragel_url_email() adds the ordinal of the matched
// single key or range to it; when nothing matches, the slot following all
// single and range slots of the state is used.
static const short _ragel_url_email_index_offsets[] = {
  0, 0, 11, 21, 29, 38, 45, 51,
  56, 62, 66, 72, 84, 106, 113, 119,
  126, 132, 139, 145, 151, 159, 166, 172,
  178, 186, 193, 199, 205, 213, 220, 232,
  243, 255, 265, 277, 290, 302, 315, 327,
  340, 352, 364, 378, 391, 403, 415, 429,
  442, 454, 466, 480, 493, 505, 517, 529,
  537, 539, 550, 559, 563, 575, 597, 606,
  615, 624, 635, 645, 659, 669, 680, 694,
  708, 722, 738
};
14110
|
|
|
|
|
|
|
|
14111
|
|
|
|
|
|
|
// Generated table for the URL/e-mail state machine. It is addressed by the slot
// position computed in ragel_url_email() (per-state base taken from
// _ragel_url_email_index_offsets plus the ordinal of the matched key) and
// yields the transition number that is then used to index
// _ragel_url_email_trans_targs and _ragel_url_email_trans_actions.
static const char _ragel_url_email_indicies[] = {
  0, 2, 3, 4, 0, 0, 0, 5,
  6, 6, 1, 0, 7, 8, 0, 0,
  0, 0, 0, 0, 1, 9, 9, 9,
  9, 9, 9, 9, 1, 9, 8, 9,
  9, 9, 9, 9, 9, 1, 10, 11,
  12, 13, 14, 14, 1, 15, 16, 14,
  14, 14, 1, 15, 14, 14, 14, 1,
  15, 17, 14, 14, 14, 1, 14, 18,
  18, 1, 15, 17, 14, 19, 19, 1,
  20, 21, 21, 20, 20, 20, 21, 20,
  20, 21, 21, 1, 22, 22, 24, 22,
  22, 23, 22, 23, 23, 23, 23, 23,
  25, 26, 23, 23, 22, 23, 23, 23,
  23, 1, 27, 28, 29, 30, 18, 18,
  1, 15, 31, 14, 14, 14, 1, 32,
  33, 34, 35, 18, 18, 1, 15, 36,
  14, 14, 14, 1, 37, 38, 39, 40,
  18, 18, 1, 15, 36, 35, 14, 14,
  1, 15, 36, 32, 14, 14, 1, 15,
  36, 41, 35, 32, 14, 14, 1, 15,
  36, 32, 14, 14, 14, 1, 15, 31,
  30, 14, 14, 1, 15, 31, 27, 14,
  14, 1, 15, 31, 42, 30, 27, 14,
  14, 1, 15, 31, 27, 14, 14, 14,
  1, 15, 16, 13, 14, 14, 1, 15,
  16, 10, 14, 14, 1, 15, 16, 43,
  13, 10, 14, 14, 1, 15, 16, 10,
  14, 14, 14, 1, 0, 44, 45, 7,
  8, 0, 0, 0, 46, 46, 46, 1,
  0, 44, 7, 8, 0, 0, 0, 46,
  46, 46, 1, 0, 44, 47, 7, 8,
  0, 0, 0, 46, 46, 46, 1, 0,
  7, 8, 0, 0, 0, 46, 48, 48,
  1, 0, 44, 47, 7, 8, 0, 0,
  0, 46, 49, 49, 1, 0, 50, 51,
  52, 7, 8, 0, 0, 0, 53, 48,
  48, 1, 0, 44, 54, 7, 8, 0,
  0, 0, 46, 46, 46, 1, 0, 55,
  56, 57, 7, 8, 0, 0, 0, 58,
  48, 48, 1, 0, 44, 59, 7, 8,
  0, 0, 0, 46, 46, 46, 1, 0,
  60, 61, 62, 7, 8, 0, 0, 0,
  63, 48, 48, 1, 0, 44, 59, 7,
  8, 0, 0, 0, 58, 46, 46, 1,
  0, 44, 59, 7, 8, 0, 0, 0,
  55, 46, 46, 1, 0, 44, 59, 64,
  7, 8, 0, 0, 0, 58, 55, 46,
  46, 1, 0, 44, 59, 7, 8, 0,
  0, 0, 55, 46, 46, 46, 1, 0,
  44, 54, 7, 8, 0, 0, 0, 53,
  46, 46, 1, 0, 44, 54, 7, 8,
  0, 0, 0, 50, 46, 46, 1, 0,
  44, 54, 65, 7, 8, 0, 0, 0,
  53, 50, 46, 46, 1, 0, 44, 54,
  7, 8, 0, 0, 0, 50, 46, 46,
  46, 1, 0, 44, 45, 7, 8, 0,
  0, 0, 5, 46, 46, 1, 0, 44,
  45, 7, 8, 0, 0, 0, 2, 46,
  46, 1, 0, 44, 45, 66, 7, 8,
  0, 0, 0, 5, 2, 46, 46, 1,
  0, 44, 45, 7, 8, 0, 0, 0,
  2, 46, 46, 46, 1, 0, 44, 47,
  7, 8, 0, 0, 0, 46, 67, 67,
  1, 0, 44, 47, 7, 8, 0, 0,
  0, 46, 68, 68, 1, 0, 44, 47,
  69, 8, 0, 0, 0, 46, 68, 68,
  1, 9, 70, 9, 9, 9, 9, 9,
  1, 71, 1, 0, 2, 3, 4, 0,
  0, 0, 5, 46, 46, 1, 15, 17,
  72, 21, 23, 14, 19, 19, 1, 21,
  23, 72, 1, 20, 21, 21, 20, 20,
  20, 21, 20, 20, 21, 21, 1, 22,
  22, 24, 22, 22, 23, 22, 23, 23,
  23, 23, 23, 25, 26, 23, 23, 22,
  23, 23, 23, 23, 1, 15, 17, 72,
  21, 23, 14, 14, 14, 1, 15, 17,
  72, 21, 23, 40, 14, 14, 1, 15,
  17, 72, 21, 23, 37, 14, 14, 1,
  15, 17, 73, 72, 21, 23, 40, 37,
  14, 14, 1, 15, 17, 72, 21, 23,
  37, 14, 14, 14, 1, 0, 44, 47,
  74, 8, 0, 21, 23, 0, 0, 46,
  49, 49, 1, 9, 9, 21, 23, 9,
  9, 75, 9, 9, 1, 9, 8, 9,
  21, 23, 9, 9, 75, 9, 9, 1,
  0, 44, 47, 74, 8, 0, 21, 23,
  0, 0, 46, 46, 46, 1, 0, 44,
  47, 74, 8, 0, 21, 23, 0, 0,
  63, 46, 46, 1, 0, 44, 47, 74,
  8, 0, 21, 23, 0, 0, 60, 46,
  46, 1, 0, 44, 47, 76, 74, 8,
  0, 21, 23, 0, 0, 63, 60, 46,
  46, 1, 0, 44, 47, 74, 8, 0,
  21, 23, 0, 0, 60, 46, 46, 46,
  1, 0
};
14208
|
|
|
|
|
|
|
|
14209
|
|
|
|
|
|
|
// Generated table for the URL/e-mail state machine, indexed by transition
// number. Each entry is the state the machine enters when that transition is
// taken. A target of 0 makes ragel_url_email() stop scanning.
static const char _ragel_url_email_trans_targs[] = {
  2, 0, 30, 48, 50, 49, 52, 3,
  5, 4, 6, 26, 28, 27, 8, 7,
  13, 9, 10, 58, 11, 60, 12, 61,
  61, 12, 61, 14, 22, 24, 23, 15,
  16, 18, 20, 19, 17, 62, 63, 65,
  64, 21, 25, 29, 31, 35, 32, 33,
  34, 67, 36, 44, 46, 45, 37, 38,
  40, 42, 41, 39, 70, 71, 73, 72,
  43, 47, 51, 53, 54, 55, 56, 57,
  59, 66, 68, 69, 74
};
14221
|
|
|
|
|
|
|
|
14222
|
|
|
|
|
|
|
// Generated table for the URL/e-mail state machine, indexed by transition
// number. Each entry is the action id executed by ragel_url_email() when that
// transition is taken:
//   0 - no action,
//   1 - record the position after the current character as the match end,
//   2 - increment the parenthesis counter and record the match end,
//   3 - decrement the parenthesis counter if it is non-zero,
//   4 - decrement the parenthesis counter if it is non-zero and record the
//       match end.
static const char _ragel_url_email_trans_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 1, 0, 1, 0, 1,
  2, 3, 4, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 1, 1, 1,
  1, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 1, 1, 1, 1,
  0, 0, 0, 0, 0, 0, 0, 0,
  1, 1, 1, 1, 1
};
14234
|
|
|
|
|
|
|
|
14235
|
|
|
|
|
|
|
// Initial state of the URL/e-mail machine; ragel_url_email() assigns it to its
// state variable before scanning.
static const int ragel_url_email_start = 1;

// Shared lookup table filled once by initialize_ragel_map(): entries 0..127
// hold their own index, four code points above ASCII are mapped to 160..163,
// and every other slot created while growing the table holds 128.
// NOTE(review): the element type of `vector` is not visible in this listing
// (the template argument list is missing) - restore it from the original source.
vector ragel_tokenizer::ragel_map;
// Spin flag taken with test_and_set() around the one-time fill of ragel_map.
atomic_flag ragel_tokenizer::ragel_map_flag = ATOMIC_FLAG_INIT;
14239
|
|
|
|
|
|
|
|
14240
|
0
|
|
|
|
|
|
// Forwards url_email_tokenizer to the unicode_tokenizer base and makes sure the
// shared ragel_map table has been filled before the object is used.
ragel_tokenizer::ragel_tokenizer(unsigned url_email_tokenizer)
  : unicode_tokenizer(url_email_tokenizer)
{
  initialize_ragel_map();
}
14243
|
|
|
|
|
|
|
|
14244
|
2
|
|
|
|
|
|
void ragel_tokenizer::initialize_ragel_map() { |
14245
|
1
|
50
|
|
|
|
|
while (ragel_map_flag.test_and_set()) {} |
14246
|
1
|
50
|
|
|
|
|
if (ragel_map.empty()) { |
14247
|
129
|
100
|
|
|
|
|
for (uint8_t ascii = 0; ascii < 128; ascii++) |
14248
|
128
|
|
|
|
|
|
ragel_map.push_back(ascii); |
14249
|
|
|
|
|
|
|
|
14250
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2026', 160); // horizontal ellipsis (TRIPLE DOT) |
14251
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2019', 161); // right single quotation mark |
14252
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2018', 162); // left single quotation mark |
14253
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2010', 163); // hyphen |
14254
|
|
|
|
|
|
|
} |
14255
|
|
|
|
|
|
|
ragel_map_flag.clear(); |
14256
|
1
|
|
|
|
|
|
} |
14257
|
|
|
|
|
|
|
|
14258
|
4
|
|
|
|
|
|
// Stores `mapping` for code point `chr` in ragel_map. When `chr` lies beyond
// the current end of the table, the table is first grown to chr + 1 entries and
// the new slots are filled with 128.
void ragel_tokenizer::ragel_map_add(char32_t chr, uint8_t mapping) {
  const bool beyond_table = chr >= ragel_map.size();
  if (beyond_table) {
    ragel_map.resize(chr + 1, 128);
  }

  ragel_map[chr] = mapping;
}
14263
|
|
|
|
|
|
|
|
14264
|
7
|
|
|
|
|
|
// Tries to recognize a URL or e-mail address starting at chars[current], using
// the generated table-driven state machine.
//
// On success a token covering the longest accepted prefix is appended to
// `tokens`, `current` is moved just past it and true is returned. Otherwise
// `current` is restored to its value on entry and false is returned.
//
// `version` selects a machine variant: conditional keys of kind 0 are shifted
// by a further 256 when version >= 2. The last entry of `chars` is never
// consumed (scanning stops at chars.size() - 1), so `chars` must not be empty.
//
// NOTE(review): the element types of the two `vector` parameters are not
// visible in this listing (the template argument lists are missing) - restore
// them from the original source.
bool ragel_tokenizer::ragel_url_email(unsigned version, const vector& chars, size_t& current, vector& tokens) {
  int cs; // current state of the machine

  // start:  position on entry
  // end:    position after the last character accepted so far (== start when
  //         nothing has been accepted)
  // parens: counter of currently open parentheses maintained by the actions
  size_t start = current, end = current, parens = 0;

  {
  cs = ragel_url_email_start;
  }

  {
  int _klen;
  const short *_keys;
  int _trans;
  short _widec;

  if ( ( current) == ( (chars.size() - 1)) )
    goto _test_eof;
  if ( cs == 0 )
    goto _out;
_resume:
  // Step 1: conditional keys. If the character falls into one of the state's
  // conditional ranges, it is replaced by a widened value that encodes the
  // outcome of a runtime condition (version >= 2, or parens != 0).
  _widec = ( ragel_char(chars[current]));
  _klen = _ragel_url_email_cond_lengths[cs];
  _keys = _ragel_url_email_cond_keys + (_ragel_url_email_cond_offsets[cs]*2);
  if ( _klen > 0 ) {
    const short *_lower = _keys;
    const short *_mid;
    const short *_upper = _keys + (_klen<<1) - 2;
    // Binary search over [low, high] pairs (two shorts per pair).
    while (1) {
      if ( _upper < _lower )
        break;

      _mid = _lower + (((_upper-_lower) >> 1) & ~1);
      if ( _widec < _mid[0] )
        _upper = _mid - 2;
      else if ( _widec > _mid[1] )
        _lower = _mid + 2;
      else {
        switch ( _ragel_url_email_cond_spaces[_ragel_url_email_cond_offsets[cs] + ((_mid - _keys)>>1)] ) {
  case 0: {
    _widec = (short)(256u + (( ragel_char(chars[current])) - 0u));
    if (
version >= 2 ) _widec += 256;
    break;
  }
  case 1: {
    _widec = (short)(768u + (( ragel_char(chars[current])) - 0u));
    if (
parens ) _widec += 256;
    break;
  }
        }
        break;
      }
    }
  }

  // Step 2: find the transition slot for the (possibly widened) character.
  _keys = _ragel_url_email_trans_keys + _ragel_url_email_key_offsets[cs];
  _trans = _ragel_url_email_index_offsets[cs];

  // 2a: exact match against the state's single-value keys.
  _klen = _ragel_url_email_single_lengths[cs];
  if ( _klen > 0 ) {
    const short *_lower = _keys;
    const short *_mid;
    const short *_upper = _keys + _klen - 1;
    while (1) {
      if ( _upper < _lower )
        break;

      _mid = _lower + ((_upper-_lower) >> 1);
      if ( _widec < *_mid )
        _upper = _mid - 1;
      else if ( _widec > *_mid )
        _lower = _mid + 1;
      else {
        _trans += (unsigned int)(_mid - _keys);
        goto _match;
      }
    }
    // No single key matched: skip past the single keys and their slots.
    _keys += _klen;
    _trans += _klen;
  }

  // 2b: match against the state's [low, high] range keys.
  _klen = _ragel_url_email_range_lengths[cs];
  if ( _klen > 0 ) {
    const short *_lower = _keys;
    const short *_mid;
    const short *_upper = _keys + (_klen<<1) - 2;
    while (1) {
      if ( _upper < _lower )
        break;

      _mid = _lower + (((_upper-_lower) >> 1) & ~1);
      if ( _widec < _mid[0] )
        _upper = _mid - 2;
      else if ( _widec > _mid[1] )
        _lower = _mid + 2;
      else {
        _trans += (unsigned int)((_mid - _keys)>>1);
        goto _match;
      }
    }
    // No range matched either: use the slot following all range slots.
    _trans += _klen;
  }

_match:
  // Step 3: translate the slot to a transition and move to its target state.
  _trans = _ragel_url_email_indicies[_trans];
  cs = _ragel_url_email_trans_targs[_trans];

  if ( _ragel_url_email_trans_actions[_trans] == 0 )
    goto _again;

  // Step 4: run the action attached to the transition.
  switch ( _ragel_url_email_trans_actions[_trans] ) {
  case 3:
  {parens-=!!parens;} // decrement, but never below zero
  break;
  case 1:
  { end = current + 1; }
  break;
  case 2:
  {parens++;}
  { end = current + 1; }
  break;
  case 4:
  {parens-=!!parens;} // decrement, but never below zero
  { end = current + 1; }
  break;
  }

_again:
  // State 0 stops the scan; otherwise advance unless the final entry of
  // `chars` has been reached.
  if ( cs == 0 )
    goto _out;
  if ( ++( current) != ( (chars.size() - 1)) )
    goto _resume;
  _test_eof: {}
  _out: {}
  }

  if (end > start) {
    // Something was accepted: emit it and continue right after it.
    tokens.emplace_back(start, end - start);
    current = end;
    return true;
  } else {
    // Nothing was accepted: undo any advance made while scanning.
    current = start;
    return false;
  }
}
14410
|
|
|
|
|
|
|
|
14411
|
|
|
|
|
|
|
} // namespace morphodita |
14412
|
|
|
|
|
|
|
|
14413
|
|
|
|
|
|
|
///////// |
14414
|
|
|
|
|
|
|
// File: morphodita/tokenizer/vertical_tokenizer.h |
14415
|
|
|
|
|
|
|
///////// |
14416
|
|
|
|
|
|
|
|
14417
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14418
|
|
|
|
|
|
|
// |
14419
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14420
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14421
|
|
|
|
|
|
|
// |
14422
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14423
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14424
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14425
|
|
|
|
|
|
|
|
14426
|
|
|
|
|
|
|
namespace morphodita { |
14427
|
|
|
|
|
|
|
|
14428
|
0
|
|
|
|
|
|
class vertical_tokenizer : public unicode_tokenizer { |
14429
|
|
|
|
|
|
|
public: |
14430
|
0
|
0
|
|
|
|
|
vertical_tokenizer() : unicode_tokenizer(0) {} |
14431
|
|
|
|
|
|
|
|
14432
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
14433
|
|
|
|
|
|
|
}; |
14434
|
|
|
|
|
|
|
|
14435
|
|
|
|
|
|
|
} // namespace morphodita |
14436
|
|
|
|
|
|
|
|
14437
|
|
|
|
|
|
|
///////// |
14438
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer.cpp |
14439
|
|
|
|
|
|
|
///////// |
14440
|
|
|
|
|
|
|
|
14441
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14442
|
|
|
|
|
|
|
// |
14443
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14444
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14445
|
|
|
|
|
|
|
// |
14446
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14447
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14448
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14449
|
|
|
|
|
|
|
|
14450
|
|
|
|
|
|
|
namespace morphodita { |
14451
|
|
|
|
|
|
|
|
14452
|
0
|
|
|
|
|
|
// Allocates a vertical_tokenizer with `new` and returns it as a raw pointer;
// this function neither keeps nor frees the object.
tokenizer* tokenizer::new_vertical_tokenizer() {
  tokenizer* const created = new vertical_tokenizer();
  return created;
}
14455
|
|
|
|
|
|
|
|
14456
|
0
|
|
|
|
|
|
// Allocates a czech_tokenizer for the CZECH language in its LATEST version with
// `new` and returns it as a raw pointer; this function neither keeps nor frees
// the object.
tokenizer* tokenizer::new_czech_tokenizer() {
  tokenizer* const created =
      new czech_tokenizer(czech_tokenizer::CZECH, czech_tokenizer::LATEST);
  return created;
}
14459
|
|
|
|
|
|
|
|
14460
|
0
|
|
|
|
|
|
// Allocates an english_tokenizer in its LATEST version with `new` and returns
// it as a raw pointer; this function neither keeps nor frees the object.
tokenizer* tokenizer::new_english_tokenizer() {
  tokenizer* const created = new english_tokenizer(english_tokenizer::LATEST);
  return created;
}
14463
|
|
|
|
|
|
|
|
14464
|
0
|
|
|
|
|
|
// Allocates a generic_tokenizer in its LATEST version with `new` and returns it
// as a raw pointer; this function neither keeps nor frees the object.
tokenizer* tokenizer::new_generic_tokenizer() {
  tokenizer* const created = new generic_tokenizer(generic_tokenizer::LATEST);
  return created;
}
14467
|
|
|
|
|
|
|
|
14468
|
|
|
|
|
|
|
} // namespace morphodita |
14469
|
|
|
|
|
|
|
|
14470
|
|
|
|
|
|
|
///////// |
14471
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer_ids.h |
14472
|
|
|
|
|
|
|
///////// |
14473
|
|
|
|
|
|
|
|
14474
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14475
|
|
|
|
|
|
|
// |
14476
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14477
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14478
|
|
|
|
|
|
|
// |
14479
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14480
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14481
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14482
|
|
|
|
|
|
|
|
14483
|
|
|
|
|
|
|
namespace morphodita { |
14484
|
|
|
|
|
|
|
|
14485
|
|
|
|
|
|
|
// Numeric identifiers of the tokenizer kinds together with a parser for their
// textual names.
class tokenizer_ids {
 public:
  enum tokenizer_id {
    CZECH = 0,
    ENGLISH = 1,
    GENERIC = 2,
    GRU = 3,
  };

  // Translates a tokenizer name to its identifier. The comparison is exact
  // (case-sensitive). On success the identifier is stored in `id` and true is
  // returned; for an unknown name `id` is left untouched and false is returned.
  static bool parse(const string& str, tokenizer_id& id) {
    if (str == "czech") {
      id = CZECH;
    } else if (str == "english") {
      id = ENGLISH;
    } else if (str == "generic") {
      id = GENERIC;
    } else if (str == "gru") {
      id = GRU;
    } else {
      return false;
    }
    return true;
  }
};

using tokenizer_id = tokenizer_ids::tokenizer_id;
14504
|
|
|
|
|
|
|
|
14505
|
|
|
|
|
|
|
} // namespace morphodita |
14506
|
|
|
|
|
|
|
|
14507
|
|
|
|
|
|
|
///////// |
14508
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer_factory.cpp |
14509
|
|
|
|
|
|
|
///////// |
14510
|
|
|
|
|
|
|
|
14511
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14512
|
|
|
|
|
|
|
// |
14513
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14514
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14515
|
|
|
|
|
|
|
// |
14516
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14517
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14518
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14519
|
|
|
|
|
|
|
|
14520
|
|
|
|
|
|
|
namespace morphodita { |
14521
|
|
|
|
|
|
|
|
14522
|
1
|
|
|
|
|
|
// Reads a tokenizer factory from `is`.
//
// The first byte of the stream selects the factory kind (see tokenizer_ids);
// the matching factory then loads the rest of its data from the same stream.
// Returns the loaded factory as a raw pointer released from its unique_ptr, or
// nullptr when the stream is exhausted, the identifier is unknown, the
// identifier is ENGLISH (nothing is loaded for it here), or the factory fails
// to load.
//
// NOTE(review): the factory types passed to new_unique_ptr are not visible in
// this listing (the template argument lists are missing) - restore them from
// the original source.
tokenizer_factory* tokenizer_factory::load(istream& is) {
  // Switch on the raw int returned by get() instead of converting it to
  // tokenizer_id first: the stream may yield EOF or any byte value, and
  // converting a value outside the enumeration's range to the enum type is
  // undefined behaviour. Unknown values simply fall through to `default`.
  switch (is.get()) {
    case tokenizer_ids::GENERIC:
      {
        auto res = new_unique_ptr();
        if (res->load(is)) return res.release();
        break;
      }
    case tokenizer_ids::GRU:
      {
        auto res = new_unique_ptr();
        if (res->load(is)) return res.release();
        break;
      }
    case tokenizer_ids::CZECH:
      {
        auto res = new_unique_ptr();
        if (res->load(is)) return res.release();
        break;
      }
    case tokenizer_ids::ENGLISH:
      break;
    default:
      // EOF or an identifier this build does not know.
      break;
  }

  return nullptr;
}
14549
|
|
|
|
|
|
|
|
14550
|
0
|
|
|
|
|
|
// Opens the file named by the UTF-8 path `fname` in binary mode and loads a
// tokenizer factory from it. Returns nullptr when the file cannot be opened,
// otherwise whatever the stream overload of load() returns.
tokenizer_factory* tokenizer_factory::load(const char* fname) {
  ifstream input(path_from_utf8(fname).c_str(), ifstream::binary);
  return input ? load(input) : nullptr;
}
14556
|
|
|
|
|
|
|
|
14557
|
|
|
|
|
|
|
} // namespace morphodita |
14558
|
|
|
|
|
|
|
|
14559
|
|
|
|
|
|
|
///////// |
14560
|
|
|
|
|
|
|
// File: morphodita/tokenizer/unicode_tokenizer.cpp |
14561
|
|
|
|
|
|
|
///////// |
14562
|
|
|
|
|
|
|
|
14563
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14564
|
|
|
|
|
|
|
// |
14565
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14566
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14567
|
|
|
|
|
|
|
// |
14568
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14569
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14570
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14571
|
|
|
|
|
|
|
|
14572
|
|
|
|
|
|
|
namespace morphodita { |
14573
|
|
|
|
|
|
|
|
14574
|
1
|
|
|
|
|
|
// Stores the URL/e-mail recognition setting, makes sure the shared ragel_map
// table has been filled, and starts with an empty text.
unicode_tokenizer::unicode_tokenizer(unsigned url_email_tokenizer)
  : url_email_tokenizer(url_email_tokenizer)
{
  ragel_tokenizer::initialize_ragel_map();

  const string_piece empty_text(nullptr, 0);
  set_text(empty_text);
}
14579
|
|
|
|
|
|
|
|
14580
|
2
|
|
|
|
|
|
// Replaces the text to be tokenized.
//
// When `make_copy` is set and the text pointer is non-null, the bytes are first
// copied into text_buffer and tokenization works on that private copy.
// The text is decoded from UTF-8 into `chars`, one entry per decoded character
// paired with the pointer to its first byte, followed by a terminating entry
// with character 0 that points just past the text. `current` is reset to 0.
void unicode_tokenizer::set_text(string_piece text, bool make_copy /*= false*/) {
  using namespace unilib;

  if (make_copy && text.str) {
    text_buffer.assign(text.str, text.len);
    text.str = text_buffer.c_str();
  }

  current = 0;
  chars.clear();

  while (text.len) {
    // utf8::decode advances text.str and shrinks text.len, so remember where
    // this character begins before decoding it.
    const char* const char_begin = text.str;
    chars.emplace_back(utf8::decode(text.str, text.len), char_begin);
  }

  // Terminating entry; the rest of the tokenizer stops at chars.size() - 1.
  chars.emplace_back(0, text.str);
}
14594
|
|
|
|
|
|
|
|
14595
|
2
|
|
|
|
|
|
// Tokenizes the next sentence of the current text.
//
// Token ranges are written to *tokens_ptr, or to the internal tokens_buffer
// when tokens_ptr is null. When `forms` is non-null it receives, for every
// token, the pointer to the token's first byte and its length in bytes.
// Both outputs are cleared first. Returns false without tokenizing when the
// text is already exhausted, otherwise the result of the tokenizer-specific
// next_sentence overload.
//
// NOTE(review): the element types of the two `vector` parameters are not
// visible in this listing (the template argument lists are missing) - restore
// them from the original source.
bool unicode_tokenizer::next_sentence(vector* forms, vector* tokens_ptr) {
  auto& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer;

  tokens.clear();
  if (forms) forms->clear();
  if (current >= chars.size() - 1) return false;

  const bool result = next_sentence(tokens);
  if (!forms) return result;

  for (const auto& token : tokens) {
    const auto form_begin = chars[token.start].str;
    const auto form_end = chars[token.start + token.length].str;
    forms->emplace_back(form_begin, form_end - form_begin);
  }

  return result;
}
14608
|
|
|
|
|
|
|
|
14609
|
7
|
|
|
|
|
|
// Tries to recognize a URL or e-mail address at the current position.
// Returns false when the text is exhausted or when URL/e-mail recognition is
// switched off (url_email_tokenizer == 0); otherwise delegates to
// ragel_tokenizer::ragel_url_email, which appends the token and advances
// `current` on success.
//
// NOTE(review): the element type of `vector` is not visible in this listing
// (the template argument list is missing) - restore it from the original source.
bool unicode_tokenizer::tokenize_url_email(vector& tokens) {
  if (current >= chars.size() - 1) return false;
  if (!url_email_tokenizer) return false;

  return ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens);
}
14614
|
|
|
|
|
|
|
|
14615
|
8
|
|
|
|
|
|
bool unicode_tokenizer::emergency_sentence_split(const vector& tokens) { |
14616
|
|
|
|
|
|
|
using namespace unilib; |
14617
|
|
|
|
|
|
|
|
14618
|
|
|
|
|
|
|
// Implement emergency splitting for large sentences |
14619
|
8
|
50
|
|
|
|
|
return tokens.size() >= 500 || |
14620
|
16
|
50
|
|
|
|
|
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
14621
|
0
|
0
|
|
|
|
|
(tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po); |
14622
|
|
|
|
|
|
|
} |
14623
|
|
|
|
|
|
|
|
14624
|
0
|
|
|
|
|
|
bool unicode_tokenizer::is_eos(const vector& tokens, char32_t eos_chr, const unordered_set* abbreviations) { |
14625
|
|
|
|
|
|
|
using namespace unilib; |
14626
|
|
|
|
|
|
|
|
14627
|
0
|
0
|
|
|
|
|
if (eos_chr == '.' && !tokens.empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14628
|
|
|
|
|
|
|
// Ignore one-letter capitals before dot |
14629
|
0
|
0
|
|
|
|
|
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14630
|
|
|
|
|
|
|
return false; |
14631
|
|
|
|
|
|
|
|
14632
|
|
|
|
|
|
|
// Ignore specified abbreviations |
14633
|
0
|
0
|
|
|
|
|
if (abbreviations) { |
14634
|
|
|
|
|
|
|
eos_buffer.clear(); |
14635
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < tokens.back().length; i++) |
14636
|
0
|
|
|
|
|
|
utf8::append(eos_buffer, unicode::lowercase(chars[tokens.back().start + i].chr)); |
14637
|
0
|
0
|
|
|
|
|
if (abbreviations->count(eos_buffer)) |
14638
|
|
|
|
|
|
|
return false; |
14639
|
|
|
|
|
|
|
} |
14640
|
|
|
|
|
|
|
} |
14641
|
|
|
|
|
|
|
return true; |
14642
|
|
|
|
|
|
|
} |
14643
|
|
|
|
|
|
|
|
14644
|
|
|
|
|
|
|
} // namespace morphodita |
14645
|
|
|
|
|
|
|
|
14646
|
|
|
|
|
|
|
///////// |
14647
|
|
|
|
|
|
|
// File: morphodita/tokenizer/vertical_tokenizer.cpp |
14648
|
|
|
|
|
|
|
///////// |
14649
|
|
|
|
|
|
|
|
14650
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14651
|
|
|
|
|
|
|
// |
14652
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14653
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14654
|
|
|
|
|
|
|
// |
14655
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14656
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14657
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14658
|
|
|
|
|
|
|
|
14659
|
|
|
|
|
|
|
namespace morphodita { |
14660
|
|
|
|
|
|
|
|
14661
|
0
|
|
|
|
|
|
// Reads the next sentence from vertical input: every non-empty line becomes one
// token, and the sentence ends at the first empty line or at the end of the
// text. A line ends with '\r' or '\n'; the two-character breaks "\r\n" and
// "\n\r" are consumed as a single line break.
// Returns false only when the text is already exhausted on entry; otherwise
// true, even if no token was collected.
//
// NOTE(review): the element type of `vector` is not visible in this listing
// (the template argument list is missing) - restore it from the original source.
bool vertical_tokenizer::next_sentence(vector& tokens) {
  // The final entry of `chars` is a terminator and is never part of a line.
  const size_t text_end = chars.size() - 1;
  if (current >= text_end) return false;

  for (;;) {
    // Scan to the end of the current line.
    const size_t line_start = current;
    while (current < text_end) {
      const auto chr = chars[current].chr;
      if (chr == '\r' || chr == '\n') break;
      ++current;
    }
    const size_t line_end = current;

    // Step over the line break, including the second half of a mixed pair.
    if (current < text_end) {
      const auto first_break = chars[current].chr;
      ++current;
      if (current < text_end) {
        const auto second_break = chars[current].chr;
        const bool cr_lf = first_break == '\r' && second_break == '\n';
        const bool lf_cr = first_break == '\n' && second_break == '\r';
        if (cr_lf || lf_cr) ++current;
      }
    }

    // An empty line (or the end of the text) terminates the sentence.
    if (line_end == line_start) break;
    tokens.emplace_back(line_start, line_end - line_start);
  }

  return true;
}
14685
|
|
|
|
|
|
|
|
14686
|
|
|
|
|
|
|
} // namespace morphodita |
14687
|
|
|
|
|
|
|
|
14688
|
|
|
|
|
|
|
///////// |
14689
|
|
|
|
|
|
|
// File: unilib/version.h |
14690
|
|
|
|
|
|
|
///////// |
14691
|
|
|
|
|
|
|
|
14692
|
|
|
|
|
|
|
// This file is part of UniLib . |
14693
|
|
|
|
|
|
|
// |
14694
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
14695
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14696
|
|
|
|
|
|
|
// |
14697
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14698
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14699
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14700
|
|
|
|
|
|
|
// |
14701
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
14702
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
14703
|
|
|
|
|
|
|
|
14704
|
|
|
|
|
|
|
namespace unilib { |
14705
|
|
|
|
|
|
|
|
14706
|
0
|
|
|
|
|
|
// Version descriptor of the UniLib library: three numeric components plus a
// textual prerelease tag. The member order is part of the type's layout and
// must be kept as is.
struct version {
  unsigned major;
  unsigned minor;
  unsigned patch;
  std::string prerelease;

  // Returns current version.
  static version current();
};
14715
|
|
|
|
|
|
|
|
14716
|
|
|
|
|
|
|
} // namespace unilib |
14717
|
|
|
|
|
|
|
|
14718
|
|
|
|
|
|
|
///////// |
14719
|
|
|
|
|
|
|
// File: morphodita/version/version.h |
14720
|
|
|
|
|
|
|
///////// |
14721
|
|
|
|
|
|
|
|
14722
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14723
|
|
|
|
|
|
|
// |
14724
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14725
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14726
|
|
|
|
|
|
|
// |
14727
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14728
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14729
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14730
|
|
|
|
|
|
|
|
14731
|
|
|
|
|
|
|
namespace morphodita { |
14732
|
|
|
|
|
|
|
|
14733
|
0
|
|
|
|
|
|
class version { |
14734
|
|
|
|
|
|
|
public: |
14735
|
|
|
|
|
|
|
unsigned major; |
14736
|
|
|
|
|
|
|
unsigned minor; |
14737
|
|
|
|
|
|
|
unsigned patch; |
14738
|
|
|
|
|
|
|
string prerelease; |
14739
|
|
|
|
|
|
|
|
14740
|
|
|
|
|
|
|
// Returns current MorphoDiTa version. |
14741
|
|
|
|
|
|
|
static version current(); |
14742
|
|
|
|
|
|
|
|
14743
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
14744
|
|
|
|
|
|
|
static string version_and_copyright(const string& other_libraries = string()); |
14745
|
|
|
|
|
|
|
}; |
14746
|
|
|
|
|
|
|
|
14747
|
|
|
|
|
|
|
} // namespace morphodita |
14748
|
|
|
|
|
|
|
|
14749
|
|
|
|
|
|
|
///////// |
14750
|
|
|
|
|
|
|
// File: morphodita/version/version.cpp |
14751
|
|
|
|
|
|
|
///////// |
14752
|
|
|
|
|
|
|
|
14753
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
14754
|
|
|
|
|
|
|
// |
14755
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14756
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14757
|
|
|
|
|
|
|
// |
14758
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14759
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14760
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14761
|
|
|
|
|
|
|
|
14762
|
|
|
|
|
|
|
namespace morphodita { |
14763
|
|
|
|
|
|
|
|
14764
|
0
|
|
|
|
|
|
// Returns the MorphoDiTa version compiled into this bundle:
// major 1, minor 11, patch 1, prerelease "dev".
version version::current() {
  return version{1, 11, 1, "dev"};
}
14767
|
|
|
|
|
|
|
|
14768
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
14769
|
0
|
|
|
|
|
|
string version::version_and_copyright(const string& other_libraries) { |
14770
|
0
|
|
|
|
|
|
ostringstream info; |
14771
|
|
|
|
|
|
|
|
14772
|
|
|
|
|
|
|
auto morphodita = version::current(); |
14773
|
|
|
|
|
|
|
auto unilib = unilib::version::current(); |
14774
|
|
|
|
|
|
|
|
14775
|
0
|
|
|
|
|
|
info << "MorphoDiTa version " << morphodita.major << '.' << morphodita.minor << '.' << morphodita.patch |
14776
|
0
|
0
|
|
|
|
|
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
|
0
|
|
|
|
|
|
14777
|
0
|
|
|
|
|
|
<< " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch |
14778
|
0
|
0
|
|
|
|
|
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
0
|
|
|
|
|
|
14779
|
|
|
|
|
|
|
"Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n" |
14780
|
0
|
0
|
|
|
|
|
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
14781
|
|
|
|
|
|
|
|
14782
|
0
|
|
|
|
|
|
return info.str(); |
14783
|
|
|
|
|
|
|
} |
14784
|
|
|
|
|
|
|
|
14785
|
|
|
|
|
|
|
} // namespace morphodita |
14786
|
|
|
|
|
|
|
|
14787
|
|
|
|
|
|
|
///////// |
14788
|
|
|
|
|
|
|
// File: parsito/configuration/configuration.cpp |
14789
|
|
|
|
|
|
|
///////// |
14790
|
|
|
|
|
|
|
|
14791
|
|
|
|
|
|
|
// This file is part of Parsito . |
14792
|
|
|
|
|
|
|
// |
14793
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14794
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14795
|
|
|
|
|
|
|
// |
14796
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14797
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14798
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14799
|
|
|
|
|
|
|
|
14800
|
|
|
|
|
|
|
namespace parsito { |
14801
|
|
|
|
|
|
|
|
14802
|
1
|
|
|
|
|
|
// Resets the configuration so that it operates on tree `t`.
//
// All nodes of `t` are unlinked first. Afterwards the stack holds node 0
// (only when the tree has any nodes) and the buffer holds the remaining node
// ids in decreasing order, i.e. node 1 ends up at the back of the buffer.
// `t` must not be null.
void configuration::init(tree* t) {
  assert(t);

  t->unlink_all_nodes();
  this->t = t;

  const size_t node_count = t->nodes.size();

  // Stack: just node 0, provided it exists.
  stack.clear();
  if (node_count > 0) stack.push_back(0);

  // Buffer: ids node_count-1, ..., 2, 1.
  buffer.clear();
  buffer.reserve(node_count);
  size_t id = node_count;
  while (id > 1)
    buffer.push_back(--id);
}
14816
|
|
|
|
|
|
|
|
14817
|
0
|
|
|
|
|
|
bool configuration::final() { |
14818
|
67
|
0
|
|
|
|
|
return buffer.empty() && stack.size() <= 1; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14819
|
|
|
|
|
|
|
} |
14820
|
|
|
|
|
|
|
|
14821
|
|
|
|
|
|
|
} // namespace parsito |
14822
|
|
|
|
|
|
|
|
14823
|
|
|
|
|
|
|
///////// |
14824
|
|
|
|
|
|
|
// File: parsito/configuration/node_extractor.h |
14825
|
|
|
|
|
|
|
///////// |
14826
|
|
|
|
|
|
|
|
14827
|
|
|
|
|
|
|
// This file is part of Parsito . |
14828
|
|
|
|
|
|
|
// |
14829
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14830
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14831
|
|
|
|
|
|
|
// |
14832
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14833
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14834
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14835
|
|
|
|
|
|
|
|
14836
|
|
|
|
|
|
|
namespace parsito { |
14837
|
|
|
|
|
|
|
|
14838
|
1
|
|
|
|
|
|
class node_extractor { |
14839
|
|
|
|
|
|
|
public: |
14840
|
|
|
|
|
|
|
unsigned node_count() const; |
14841
|
|
|
|
|
|
|
void extract(const configuration& conf, vector& nodes) const; |
14842
|
|
|
|
|
|
|
|
14843
|
|
|
|
|
|
|
bool create(string_piece description, string& error); |
14844
|
|
|
|
|
|
|
|
14845
|
|
|
|
|
|
|
private: |
14846
|
|
|
|
|
|
|
enum start_t { STACK = 0, BUFFER = 1 }; |
14847
|
|
|
|
|
|
|
enum direction_t { PARENT = 0, CHILD = 1 }; |
14848
|
80
|
|
|
|
|
|
struct node_selector { |
14849
|
|
|
|
|
|
|
pair start; |
14850
|
|
|
|
|
|
|
vector> directions; |
14851
|
|
|
|
|
|
|
|
14852
|
|
|
|
|
|
|
node_selector(start_t start, int start_index) : start(start, start_index) {} |
14853
|
|
|
|
|
|
|
}; |
14854
|
|
|
|
|
|
|
|
14855
|
|
|
|
|
|
|
vector selectors; |
14856
|
|
|
|
|
|
|
}; |
14857
|
|
|
|
|
|
|
|
14858
|
|
|
|
|
|
|
} // namespace parsito |
14859
|
|
|
|
|
|
|
|
14860
|
|
|
|
|
|
|
///////// |
14861
|
|
|
|
|
|
|
// File: parsito/configuration/node_extractor.cpp |
14862
|
|
|
|
|
|
|
///////// |
14863
|
|
|
|
|
|
|
|
14864
|
|
|
|
|
|
|
// This file is part of Parsito . |
14865
|
|
|
|
|
|
|
// |
14866
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14867
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14868
|
|
|
|
|
|
|
// |
14869
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14870
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14871
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14872
|
|
|
|
|
|
|
|
14873
|
|
|
|
|
|
|
namespace parsito { |
14874
|
|
|
|
|
|
|
|
14875
|
0
|
|
|
|
|
|
unsigned node_extractor::node_count() const { |
14876
|
0
|
|
|
|
|
|
return selectors.size(); |
14877
|
|
|
|
|
|
|
} |
14878
|
|
|
|
|
|
|
|
14879
|
62
|
|
|
|
|
|
void node_extractor::extract(const configuration& conf, vector& nodes) const { |
14880
|
|
|
|
|
|
|
nodes.clear(); |
14881
|
1178
|
100
|
|
|
|
|
for (auto&& selector : selectors) { |
14882
|
|
|
|
|
|
|
// Start by locating starting node |
14883
|
1116
|
|
|
|
|
|
int current = -1; |
14884
|
1116
|
|
|
|
|
|
switch (selector.start.first) { |
14885
|
|
|
|
|
|
|
case STACK: |
14886
|
930
|
100
|
|
|
|
|
if (selector.start.second < int(conf.stack.size())) |
14887
|
867
|
|
|
|
|
|
current = conf.stack[conf.stack.size() - 1 - selector.start.second]; |
14888
|
|
|
|
|
|
|
break; |
14889
|
|
|
|
|
|
|
case BUFFER: |
14890
|
186
|
100
|
|
|
|
|
if (selector.start.second < int(conf.buffer.size())) |
14891
|
98
|
|
|
|
|
|
current = conf.buffer[conf.buffer.size() - 1 - selector.start.second]; |
14892
|
|
|
|
|
|
|
break; |
14893
|
|
|
|
|
|
|
} |
14894
|
|
|
|
|
|
|
|
14895
|
|
|
|
|
|
|
// Follow directions to the final node |
14896
|
1116
|
100
|
|
|
|
|
if (current >= 0) |
14897
|
1212
|
100
|
|
|
|
|
for (auto&& direction : selector.directions) { |
14898
|
802
|
|
|
|
|
|
const node& node = conf.t->nodes[current]; |
14899
|
802
|
|
|
|
|
|
switch (direction.first) { |
14900
|
|
|
|
|
|
|
case PARENT: |
14901
|
0
|
0
|
|
|
|
|
current = node.head ? node.head : -1; |
14902
|
0
|
|
|
|
|
|
break; |
14903
|
|
|
|
|
|
|
case CHILD: |
14904
|
401
|
100
|
|
|
|
|
current = direction.second >= 0 && direction.second < int(node.children.size()) ? |
14905
|
120
|
|
|
|
|
|
node.children[direction.second] : |
14906
|
401
|
100
|
|
|
|
|
direction.second < 0 && -direction.second <= int(node.children.size()) ? |
14907
|
127
|
|
|
|
|
|
node.children[node.children.size() + direction.second] : |
14908
|
1330
|
100
|
|
|
|
|
-1; |
|
|
100
|
|
|
|
|
|
14909
|
802
|
|
|
|
|
|
break; |
14910
|
|
|
|
|
|
|
} |
14911
|
802
|
100
|
|
|
|
|
if (current <= 0) break; |
14912
|
|
|
|
|
|
|
} |
14913
|
|
|
|
|
|
|
|
14914
|
|
|
|
|
|
|
// Add the selected node |
14915
|
1116
|
|
|
|
|
|
nodes.push_back(current); |
14916
|
|
|
|
|
|
|
} |
14917
|
62
|
|
|
|
|
|
} |
14918
|
|
|
|
|
|
|
|
14919
|
1
|
|
|
|
|
|
bool node_extractor::create(string_piece description, string& error) { |
14920
|
1
|
|
|
|
|
|
selectors.clear(); |
14921
|
|
|
|
|
|
|
error.clear(); |
14922
|
|
|
|
|
|
|
|
14923
|
|
|
|
|
|
|
vector lines, parts, words; |
14924
|
1
|
50
|
|
|
|
|
split(description, '\n', lines); |
14925
|
20
|
100
|
|
|
|
|
for (auto&& line : lines) { |
14926
|
19
|
100
|
|
|
|
|
if (!line.len || line.str[0] == '#') continue; |
|
|
50
|
|
|
|
|
|
14927
|
|
|
|
|
|
|
|
14928
|
|
|
|
|
|
|
// Separate start and directions |
14929
|
18
|
50
|
|
|
|
|
split(line, ',', parts); |
14930
|
|
|
|
|
|
|
|
14931
|
|
|
|
|
|
|
// Parse start |
14932
|
18
|
50
|
|
|
|
|
split(parts[0], ' ', words); |
14933
|
18
|
50
|
|
|
|
|
if (words.size() != 2) |
14934
|
0
|
0
|
|
|
|
|
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14935
|
|
|
|
|
|
|
|
14936
|
|
|
|
|
|
|
start_t start; |
14937
|
18
|
100
|
|
|
|
|
if (words[0] == "stack") |
14938
|
15
|
|
|
|
|
|
start = STACK; |
14939
|
3
|
50
|
|
|
|
|
else if (words[0] == "buffer") |
14940
|
3
|
|
|
|
|
|
start = BUFFER; |
14941
|
|
|
|
|
|
|
else |
14942
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14943
|
|
|
|
|
|
|
|
14944
|
|
|
|
|
|
|
int start_index; |
14945
|
18
|
50
|
|
|
|
|
if (!parse_int(words[1], "starting index", start_index, error)) return false; |
|
|
50
|
|
|
|
|
|
14946
|
|
|
|
|
|
|
|
14947
|
18
|
50
|
|
|
|
|
selectors.emplace_back(start, start_index); |
14948
|
|
|
|
|
|
|
|
14949
|
|
|
|
|
|
|
// Parse directions |
14950
|
34
|
100
|
|
|
|
|
for (size_t i = 1; i < parts.size(); i++) { |
14951
|
16
|
50
|
|
|
|
|
split(parts[i], ' ', words); |
14952
|
16
|
50
|
|
|
|
|
if (words.empty()) |
14953
|
0
|
0
|
|
|
|
|
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14954
|
|
|
|
|
|
|
|
14955
|
16
|
50
|
|
|
|
|
if (words[0] == "parent") { |
14956
|
0
|
0
|
|
|
|
|
if (words.size() != 1) |
14957
|
0
|
0
|
|
|
|
|
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14958
|
0
|
0
|
|
|
|
|
selectors.back().directions.emplace_back(PARENT, 0); |
14959
|
16
|
50
|
|
|
|
|
} else if (words[0] == "child") { |
14960
|
16
|
50
|
|
|
|
|
if (words.size() != 2) |
14961
|
0
|
0
|
|
|
|
|
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14962
|
|
|
|
|
|
|
int child_index; |
14963
|
16
|
50
|
|
|
|
|
if (!parse_int(words[1], "child index", child_index, error)) return false; |
|
|
50
|
|
|
|
|
|
14964
|
16
|
50
|
|
|
|
|
selectors.back().directions.emplace_back(CHILD, child_index); |
14965
|
|
|
|
|
|
|
} else { |
14966
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
14967
|
|
|
|
|
|
|
} |
14968
|
|
|
|
|
|
|
} |
14969
|
|
|
|
|
|
|
} |
14970
|
|
|
|
|
|
|
|
14971
|
|
|
|
|
|
|
return true; |
14972
|
|
|
|
|
|
|
} |
14973
|
|
|
|
|
|
|
|
14974
|
|
|
|
|
|
|
} // namespace parsito |
14975
|
|
|
|
|
|
|
|
14976
|
|
|
|
|
|
|
///////// |
14977
|
|
|
|
|
|
|
// File: parsito/configuration/value_extractor.h |
14978
|
|
|
|
|
|
|
///////// |
14979
|
|
|
|
|
|
|
|
14980
|
|
|
|
|
|
|
// This file is part of Parsito . |
14981
|
|
|
|
|
|
|
// |
14982
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
14983
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
14984
|
|
|
|
|
|
|
// |
14985
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
14986
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
14987
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
14988
|
|
|
|
|
|
|
|
14989
|
|
|
|
|
|
|
namespace parsito { |
14990
|
|
|
|
|
|
|
|
14991
|
|
|
|
|
|
|
class value_extractor { |
14992
|
|
|
|
|
|
|
public: |
14993
|
|
|
|
|
|
|
void extract(const node& n, string& value) const; |
14994
|
|
|
|
|
|
|
|
14995
|
|
|
|
|
|
|
bool create(string_piece description, string& error); |
14996
|
|
|
|
|
|
|
|
14997
|
|
|
|
|
|
|
private: |
14998
|
|
|
|
|
|
|
enum value_t { FORM = 0, LEMMA = 1, LEMMA_ID = 2, TAG = 3, UNIVERSAL_TAG = 4, |
14999
|
|
|
|
|
|
|
FEATS = 5, UNIVERSAL_TAG_FEATS = 6, DEPREL = 7 }; |
15000
|
|
|
|
|
|
|
value_t selector; |
15001
|
|
|
|
|
|
|
}; |
15002
|
|
|
|
|
|
|
|
15003
|
|
|
|
|
|
|
} // namespace parsito |
15004
|
|
|
|
|
|
|
|
15005
|
|
|
|
|
|
|
///////// |
15006
|
|
|
|
|
|
|
// File: parsito/configuration/value_extractor.cpp |
15007
|
|
|
|
|
|
|
///////// |
15008
|
|
|
|
|
|
|
|
15009
|
|
|
|
|
|
|
// This file is part of Parsito . |
15010
|
|
|
|
|
|
|
// |
15011
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15012
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15013
|
|
|
|
|
|
|
// |
15014
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15015
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15016
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15017
|
|
|
|
|
|
|
|
15018
|
|
|
|
|
|
|
namespace parsito { |
15019
|
|
|
|
|
|
|
|
15020
|
2016
|
|
|
|
|
|
void value_extractor::extract(const node& n, string& value) const { |
15021
|
2016
|
|
|
|
|
|
switch (selector) { |
15022
|
|
|
|
|
|
|
case FORM: |
15023
|
504
|
|
|
|
|
|
value.assign(n.form); |
15024
|
|
|
|
|
|
|
break; |
15025
|
|
|
|
|
|
|
case LEMMA: |
15026
|
0
|
|
|
|
|
|
value.assign(n.lemma); |
15027
|
|
|
|
|
|
|
break; |
15028
|
|
|
|
|
|
|
case LEMMA_ID: |
15029
|
0
|
0
|
|
|
|
|
if (!n.misc.empty()) { |
15030
|
|
|
|
|
|
|
// Try finding LId= in misc column |
15031
|
0
|
|
|
|
|
|
auto lid = n.misc.find("LId="); |
15032
|
0
|
0
|
|
|
|
|
if (lid != string::npos) { |
15033
|
0
|
|
|
|
|
|
lid += 4; |
15034
|
|
|
|
|
|
|
|
15035
|
|
|
|
|
|
|
// Find optional | ending the lemma_id |
15036
|
0
|
|
|
|
|
|
auto lid_end = n.misc.find('|', lid); |
15037
|
0
|
0
|
|
|
|
|
if (lid_end == string::npos) lid_end = n.misc.size(); |
15038
|
|
|
|
|
|
|
|
15039
|
|
|
|
|
|
|
// Store the lemma_id |
15040
|
0
|
|
|
|
|
|
value.assign(n.misc, lid, lid_end - lid); |
15041
|
0
|
|
|
|
|
|
break; |
15042
|
|
|
|
|
|
|
} |
15043
|
|
|
|
|
|
|
} |
15044
|
0
|
|
|
|
|
|
value.assign(n.lemma); |
15045
|
|
|
|
|
|
|
break; |
15046
|
|
|
|
|
|
|
case TAG: |
15047
|
0
|
|
|
|
|
|
value.assign(n.xpostag); |
15048
|
|
|
|
|
|
|
break; |
15049
|
|
|
|
|
|
|
case UNIVERSAL_TAG: |
15050
|
504
|
|
|
|
|
|
value.assign(n.upostag); |
15051
|
|
|
|
|
|
|
break; |
15052
|
|
|
|
|
|
|
case FEATS: |
15053
|
504
|
|
|
|
|
|
value.assign(n.feats); |
15054
|
|
|
|
|
|
|
break; |
15055
|
|
|
|
|
|
|
case UNIVERSAL_TAG_FEATS: |
15056
|
0
|
|
|
|
|
|
value.assign(n.upostag).append(n.feats); |
15057
|
|
|
|
|
|
|
break; |
15058
|
|
|
|
|
|
|
case DEPREL: |
15059
|
504
|
|
|
|
|
|
value.assign(n.deprel); |
15060
|
|
|
|
|
|
|
break; |
15061
|
|
|
|
|
|
|
} |
15062
|
2016
|
|
|
|
|
|
} |
15063
|
|
|
|
|
|
|
|
15064
|
4
|
|
|
|
|
|
bool value_extractor::create(string_piece description, string& error) { |
15065
|
|
|
|
|
|
|
error.clear(); |
15066
|
|
|
|
|
|
|
|
15067
|
4
|
100
|
|
|
|
|
if (description == "form") |
15068
|
1
|
|
|
|
|
|
selector = FORM; |
15069
|
3
|
50
|
|
|
|
|
else if (description == "lemma") |
15070
|
0
|
|
|
|
|
|
selector = LEMMA; |
15071
|
3
|
50
|
|
|
|
|
else if (description == "lemma_id") |
15072
|
0
|
|
|
|
|
|
selector = LEMMA_ID; |
15073
|
3
|
50
|
|
|
|
|
else if (description == "tag") |
15074
|
0
|
|
|
|
|
|
selector = TAG; |
15075
|
3
|
100
|
|
|
|
|
else if (description == "universal_tag") |
15076
|
1
|
|
|
|
|
|
selector = UNIVERSAL_TAG; |
15077
|
2
|
100
|
|
|
|
|
else if (description == "feats") |
15078
|
1
|
|
|
|
|
|
selector = FEATS; |
15079
|
1
|
50
|
|
|
|
|
else if (description == "universal_tag_feats") |
15080
|
0
|
|
|
|
|
|
selector = UNIVERSAL_TAG_FEATS; |
15081
|
1
|
50
|
|
|
|
|
else if (description == "deprel") |
15082
|
1
|
|
|
|
|
|
selector = DEPREL; |
15083
|
|
|
|
|
|
|
else |
15084
|
0
|
|
|
|
|
|
return error.assign("Cannot parse value selector '").append(description.str, description.len).append("'!"), false; |
15085
|
|
|
|
|
|
|
|
15086
|
|
|
|
|
|
|
return true; |
15087
|
|
|
|
|
|
|
} |
15088
|
|
|
|
|
|
|
|
15089
|
|
|
|
|
|
|
} // namespace parsito |
15090
|
|
|
|
|
|
|
|
15091
|
|
|
|
|
|
|
///////// |
15092
|
|
|
|
|
|
|
// File: parsito/embedding/embedding.h |
15093
|
|
|
|
|
|
|
///////// |
15094
|
|
|
|
|
|
|
|
15095
|
|
|
|
|
|
|
// This file is part of Parsito . |
15096
|
|
|
|
|
|
|
// |
15097
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15098
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15099
|
|
|
|
|
|
|
// |
15100
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15101
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15102
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15103
|
|
|
|
|
|
|
|
15104
|
|
|
|
|
|
|
namespace parsito { |
15105
|
|
|
|
|
|
|
|
15106
|
4
|
|
|
|
|
|
class embedding { |
15107
|
|
|
|
|
|
|
public: |
15108
|
|
|
|
|
|
|
unsigned dimension; |
15109
|
|
|
|
|
|
|
|
15110
|
|
|
|
|
|
|
int lookup_word(const string& word, string& buffer) const; |
15111
|
|
|
|
|
|
|
int unknown_word() const; |
15112
|
|
|
|
|
|
|
float* weight(int id); // nullptr for wrong id |
15113
|
|
|
|
|
|
|
const float* weight(int id) const; // nullpt for wrong id |
15114
|
|
|
|
|
|
|
|
15115
|
|
|
|
|
|
|
bool can_update_weights(int id) const; |
15116
|
|
|
|
|
|
|
|
15117
|
|
|
|
|
|
|
void load(binary_decoder& data); |
15118
|
|
|
|
|
|
|
void save(binary_encoder& enc) const; |
15119
|
|
|
|
|
|
|
|
15120
|
|
|
|
|
|
|
void create(unsigned dimension, int updatable_index, const vector>>& words, const vector& unknown_weights); |
15121
|
|
|
|
|
|
|
void export_embeddings(vector>>& words, vector& unknown_weights) const; |
15122
|
|
|
|
|
|
|
private: |
15123
|
|
|
|
|
|
|
int updatable_index, unknown_index; |
15124
|
|
|
|
|
|
|
|
15125
|
|
|
|
|
|
|
unordered_map dictionary; |
15126
|
|
|
|
|
|
|
vector weights; |
15127
|
|
|
|
|
|
|
}; |
15128
|
|
|
|
|
|
|
|
15129
|
|
|
|
|
|
|
} // namespace parsito |
15130
|
|
|
|
|
|
|
|
15131
|
|
|
|
|
|
|
///////// |
15132
|
|
|
|
|
|
|
// File: parsito/embedding/embedding.cpp |
15133
|
|
|
|
|
|
|
///////// |
15134
|
|
|
|
|
|
|
|
15135
|
|
|
|
|
|
|
// This file is part of Parsito . |
15136
|
|
|
|
|
|
|
// |
15137
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15138
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15139
|
|
|
|
|
|
|
// |
15140
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15141
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15142
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15143
|
|
|
|
|
|
|
|
15144
|
|
|
|
|
|
|
namespace parsito { |
15145
|
|
|
|
|
|
|
|
15146
|
128
|
|
|
|
|
|
int embedding::lookup_word(const string& word, string& buffer) const { |
15147
|
|
|
|
|
|
|
using namespace unilib; |
15148
|
|
|
|
|
|
|
|
15149
|
|
|
|
|
|
|
auto it = dictionary.find(word); |
15150
|
128
|
100
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
15151
|
|
|
|
|
|
|
|
15152
|
|
|
|
|
|
|
// We now apply several heuristics to find a match |
15153
|
|
|
|
|
|
|
|
15154
|
|
|
|
|
|
|
// Try locating uppercase/titlecase characters which we could lowercase |
15155
|
|
|
|
|
|
|
bool first = true; |
15156
|
|
|
|
|
|
|
unicode::category_t first_category = 0, other_categories = 0; |
15157
|
54
|
100
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) { |
15158
|
18
|
100
|
|
|
|
|
(first ? first_category : other_categories) |= unicode::category(chr); |
15159
|
|
|
|
|
|
|
first = false; |
15160
|
|
|
|
|
|
|
} |
15161
|
|
|
|
|
|
|
|
15162
|
36
|
50
|
|
|
|
|
if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) { |
|
|
0
|
|
|
|
|
|
15163
|
|
|
|
|
|
|
// Lowercase all characters but the first |
15164
|
|
|
|
|
|
|
buffer.clear(); |
15165
|
|
|
|
|
|
|
first = true; |
15166
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) { |
15167
|
0
|
0
|
|
|
|
|
utf8::append(buffer, first ? chr : unicode::lowercase(chr)); |
15168
|
|
|
|
|
|
|
first = false; |
15169
|
|
|
|
|
|
|
} |
15170
|
|
|
|
|
|
|
|
15171
|
|
|
|
|
|
|
it = dictionary.find(buffer); |
15172
|
0
|
0
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
15173
|
|
|
|
|
|
|
} |
15174
|
|
|
|
|
|
|
|
15175
|
36
|
50
|
|
|
|
|
if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) { |
|
|
50
|
|
|
|
|
|
15176
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, word, buffer); |
15177
|
|
|
|
|
|
|
|
15178
|
|
|
|
|
|
|
it = dictionary.find(buffer); |
15179
|
0
|
0
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
15180
|
|
|
|
|
|
|
} |
15181
|
|
|
|
|
|
|
|
15182
|
|
|
|
|
|
|
// If the word starts with digit and contain only digits and non-letter characters |
15183
|
|
|
|
|
|
|
// i.e. large number, date, time, try replacing it with first digit only. |
15184
|
36
|
50
|
|
|
|
|
if ((first_category & unicode::N) && !(other_categories & unicode::L)) { |
|
|
0
|
|
|
|
|
|
15185
|
|
|
|
|
|
|
buffer.clear(); |
15186
|
0
|
|
|
|
|
|
utf8::append(buffer, utf8::first(word)); |
15187
|
|
|
|
|
|
|
|
15188
|
|
|
|
|
|
|
it = dictionary.find(buffer); |
15189
|
0
|
0
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
15190
|
|
|
|
|
|
|
} |
15191
|
|
|
|
|
|
|
|
15192
|
36
|
|
|
|
|
|
return unknown_index; |
15193
|
|
|
|
|
|
|
} |
15194
|
|
|
|
|
|
|
|
15195
|
0
|
|
|
|
|
|
int embedding::unknown_word() const { |
15196
|
0
|
|
|
|
|
|
return unknown_index; |
15197
|
|
|
|
|
|
|
} |
15198
|
|
|
|
|
|
|
|
15199
|
0
|
|
|
|
|
|
float* embedding::weight(int id) { |
15200
|
0
|
0
|
|
|
|
|
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15201
|
0
|
|
|
|
|
|
return weights.data() + id * dimension; |
15202
|
|
|
|
|
|
|
} |
15203
|
|
|
|
|
|
|
|
15204
|
0
|
|
|
|
|
|
const float* embedding::weight(int id) const { |
15205
|
58
|
0
|
|
|
|
|
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15206
|
54
|
|
|
|
|
|
return weights.data() + id * dimension; |
15207
|
|
|
|
|
|
|
} |
15208
|
|
|
|
|
|
|
|
15209
|
4
|
|
|
|
|
|
// Loads the embedding from `data`.
//
// The data are read in this order: the dimension (4B), the number of words
// (4B) followed by the words themselves, a 1B flag telling whether weights of
// an unknown word are present, and finally the weights. Words receive ids in
// the order in which they are read; the unknown word, if any, gets the id
// following the last word.
void embedding::load(binary_decoder& data) {
  // Load dimension
  dimension = data.next_4B();

  // A loaded embedding has no updatable ids: no id reaches the maximum value.
  updatable_index = numeric_limits<decltype(updatable_index)>::max();

  // Load dictionary
  dictionary.clear();
  string word;
  for (unsigned size = data.next_4B(); size; size--) {
    data.next_str(word);
    dictionary.emplace(word, (int)dictionary.size());
  }

  unknown_index = data.next_1B() ? int(dictionary.size()) : -1;

  // Load weights
  weights.resize(dimension * (dictionary.size() + (unknown_index >= 0)));
  // NOTE(review): the element type of next<>() was lost in this listing;
  // float is assumed because the bytes are copied into the float weights.
  memcpy(weights.data(), data.next<float>(weights.size()), sizeof(float) * weights.size());
}
15229
|
|
|
|
|
|
|
|
15230
|
|
|
|
|
|
|
} // namespace parsito |
15231
|
|
|
|
|
|
|
|
15232
|
|
|
|
|
|
|
///////// |
15233
|
|
|
|
|
|
|
// File: parsito/embedding/embedding_encode.cpp |
15234
|
|
|
|
|
|
|
///////// |
15235
|
|
|
|
|
|
|
|
15236
|
|
|
|
|
|
|
// This file is part of Parsito . |
15237
|
|
|
|
|
|
|
// |
15238
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15239
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15240
|
|
|
|
|
|
|
// |
15241
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15242
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15243
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15244
|
|
|
|
|
|
|
|
15245
|
|
|
|
|
|
|
namespace parsito { |
15246
|
|
|
|
|
|
|
|
15247
|
0
|
|
|
|
|
|
// Stores the embedding in `enc`.
//
// The data are written in this order: the dimension (4B), the number of words
// (4B) followed by the words ordered by their ids, a 1B flag telling whether
// an unknown word is present, and finally all weights.
void embedding::save(binary_encoder& enc) const {
  // Save dimension
  enc.add_4B(dimension);

  // Save the dictionary; the words are first placed at the position given
  // by their id, because the dictionary itself is unordered
  vector<string> words(dictionary.size());
  for (auto&& entry : dictionary) {
    assert(entry.second >= 0 && entry.second < int(dictionary.size()));
    words[entry.second] = entry.first;
  }
  enc.add_4B(dictionary.size());
  for (auto&& word : words)
    enc.add_str(word);

  enc.add_1B(unknown_index >= 0);

  // Save the weights
  enc.add_data(weights);
}
15266
|
|
|
|
|
|
|
|
15267
|
0
|
|
|
|
|
|
bool embedding::can_update_weights(int id) const { |
15268
|
0
|
|
|
|
|
|
return id >= int(updatable_index); |
15269
|
|
|
|
|
|
|
} |
15270
|
|
|
|
|
|
|
|
15271
|
0
|
|
|
|
|
|
void embedding::create(unsigned dimension, int updatable_index, const vector>>& words, const vector& unknown_weights) { |
15272
|
0
|
|
|
|
|
|
this->dimension = dimension; |
15273
|
0
|
|
|
|
|
|
this->updatable_index = updatable_index; |
15274
|
|
|
|
|
|
|
|
15275
|
|
|
|
|
|
|
dictionary.clear(); |
15276
|
|
|
|
|
|
|
weights.clear(); |
15277
|
0
|
0
|
|
|
|
|
for (auto&& word : words) { |
15278
|
0
|
0
|
|
|
|
|
assert(word.second.size() == dimension); |
15279
|
0
|
|
|
|
|
|
dictionary.emplace(word.first, (int)dictionary.size()); |
15280
|
0
|
|
|
|
|
|
weights.insert(weights.end(), word.second.begin(), word.second.end()); |
15281
|
|
|
|
|
|
|
} |
15282
|
|
|
|
|
|
|
|
15283
|
0
|
0
|
|
|
|
|
if (unknown_weights.empty()) { |
15284
|
0
|
|
|
|
|
|
this->unknown_index = -1; |
15285
|
|
|
|
|
|
|
} else { |
15286
|
0
|
|
|
|
|
|
this->unknown_index = dictionary.size(); |
15287
|
0
|
|
|
|
|
|
weights.insert(weights.end(), unknown_weights.begin(), unknown_weights.end()); |
15288
|
|
|
|
|
|
|
} |
15289
|
0
|
|
|
|
|
|
} |
15290
|
|
|
|
|
|
|
|
15291
|
0
|
|
|
|
|
|
void embedding::export_embeddings(vector>>& words, vector& unknown_weights) const { |
15292
|
|
|
|
|
|
|
words.clear(); |
15293
|
|
|
|
|
|
|
unknown_weights.clear(); |
15294
|
|
|
|
|
|
|
|
15295
|
0
|
0
|
|
|
|
|
if (dictionary.empty()) return; |
15296
|
|
|
|
|
|
|
|
15297
|
0
|
0
|
|
|
|
|
assert(unknown_index < 0 || unknown_index == int(dictionary.size())); |
|
|
0
|
|
|
|
|
|
15298
|
|
|
|
|
|
|
|
15299
|
0
|
|
|
|
|
|
words.resize(dictionary.size()); |
15300
|
0
|
0
|
|
|
|
|
for (auto&& entry : dictionary) { |
15301
|
0
|
|
|
|
|
|
words[entry.second].first = entry.first; |
15302
|
0
|
|
|
|
|
|
words[entry.second].second.assign(weights.data() + entry.second * dimension, weights.data() + entry.second * dimension + dimension); |
15303
|
|
|
|
|
|
|
} |
15304
|
0
|
0
|
|
|
|
|
if (unknown_index >= 0) |
15305
|
0
|
|
|
|
|
|
unknown_weights.assign(weights.data() + unknown_index * dimension, weights.data() + unknown_index * dimension + dimension); |
15306
|
|
|
|
|
|
|
} |
15307
|
|
|
|
|
|
|
|
15308
|
|
|
|
|
|
|
} // namespace parsito |
15309
|
|
|
|
|
|
|
|
15310
|
|
|
|
|
|
|
///////// |
15311
|
|
|
|
|
|
|
// File: parsito/network/activation_function.h |
15312
|
|
|
|
|
|
|
///////// |
15313
|
|
|
|
|
|
|
|
15314
|
|
|
|
|
|
|
// This file is part of Parsito . |
15315
|
|
|
|
|
|
|
// |
15316
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15317
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15318
|
|
|
|
|
|
|
// |
15319
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15320
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15321
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15322
|
|
|
|
|
|
|
|
15323
|
|
|
|
|
|
|
namespace parsito { |
15324
|
|
|
|
|
|
|
|
15325
|
|
|
|
|
|
|
struct activation_function { |
15326
|
|
|
|
|
|
|
enum type { TANH = 0, CUBIC = 1, RELU = 2 }; |
15327
|
|
|
|
|
|
|
|
15328
|
|
|
|
|
|
|
static bool create(string_piece name, type& activation) { |
15329
|
|
|
|
|
|
|
if (name == "tanh") return activation = TANH, true; |
15330
|
|
|
|
|
|
|
if (name == "cubic") return activation = CUBIC, true; |
15331
|
|
|
|
|
|
|
if (name == "relu") return activation = RELU, true; |
15332
|
|
|
|
|
|
|
return false; |
15333
|
|
|
|
|
|
|
} |
15334
|
|
|
|
|
|
|
}; |
15335
|
|
|
|
|
|
|
|
15336
|
|
|
|
|
|
|
} // namespace parsito |
15337
|
|
|
|
|
|
|
|
15338
|
|
|
|
|
|
|
///////// |
15339
|
|
|
|
|
|
|
// File: parsito/network/neural_network.h |
15340
|
|
|
|
|
|
|
///////// |
15341
|
|
|
|
|
|
|
|
15342
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
15343
|
|
|
|
|
|
|
// |
15344
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15345
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15346
|
|
|
|
|
|
|
// |
15347
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15348
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15349
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15350
|
|
|
|
|
|
|
|
15351
|
|
|
|
|
|
|
namespace parsito { |
15352
|
|
|
|
|
|
|
|
15353
|
7
|
0
|
|
|
|
|
class neural_network { |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
15354
|
|
|
|
|
|
|
public: |
15355
|
|
|
|
|
|
|
typedef vector>> embeddings_cache; |
15356
|
|
|
|
|
|
|
|
15357
|
|
|
|
|
|
|
void propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, |
15358
|
|
|
|
|
|
|
vector& hidden_layer, vector& outcomes, const embeddings_cache* cache = nullptr, bool softmax = true) const; |
15359
|
|
|
|
|
|
|
|
15360
|
|
|
|
|
|
|
void load(binary_decoder& data); |
15361
|
|
|
|
|
|
|
void generate_tanh_cache(); |
15362
|
|
|
|
|
|
|
void generate_embeddings_cache(const vector& embeddings, embeddings_cache& cache, unsigned max_words) const; |
15363
|
|
|
|
|
|
|
|
15364
|
|
|
|
|
|
|
private: |
15365
|
|
|
|
|
|
|
friend class neural_network_trainer; |
15366
|
|
|
|
|
|
|
|
15367
|
|
|
|
|
|
|
void load_matrix(binary_decoder& data, vector>& m); |
15368
|
|
|
|
|
|
|
|
15369
|
|
|
|
|
|
|
activation_function::type hidden_layer_activation; |
15370
|
|
|
|
|
|
|
vector> weights[2]; |
15371
|
|
|
|
|
|
|
|
15372
|
|
|
|
|
|
|
vector tanh_cache; |
15373
|
|
|
|
|
|
|
}; |
15374
|
|
|
|
|
|
|
|
15375
|
|
|
|
|
|
|
} // namespace parsito |
15376
|
|
|
|
|
|
|
|
15377
|
|
|
|
|
|
|
///////// |
15378
|
|
|
|
|
|
|
// File: parsito/network/neural_network.cpp |
15379
|
|
|
|
|
|
|
///////// |
15380
|
|
|
|
|
|
|
|
15381
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
15382
|
|
|
|
|
|
|
// |
15383
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15384
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15385
|
|
|
|
|
|
|
// |
15386
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15387
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15388
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15389
|
|
|
|
|
|
|
|
15390
|
|
|
|
|
|
|
namespace parsito { |
15391
|
|
|
|
|
|
|
|
15392
|
2
|
|
|
|
|
|
void neural_network::load_matrix(binary_decoder& data, vector>& m) { |
15393
|
2
|
|
|
|
|
|
unsigned rows = data.next_4B(); |
15394
|
2
|
|
|
|
|
|
unsigned columns = data.next_4B(); |
15395
|
|
|
|
|
|
|
|
15396
|
2
|
|
|
|
|
|
m.resize(rows); |
15397
|
369
|
100
|
|
|
|
|
for (auto&& row : m) { |
15398
|
367
|
|
|
|
|
|
row.resize(columns); |
15399
|
367
|
|
|
|
|
|
memcpy(row.data(), data.next(columns), sizeof(float) * columns); |
15400
|
|
|
|
|
|
|
} |
15401
|
2
|
|
|
|
|
|
} |
15402
|
|
|
|
|
|
|
|
15403
|
1
|
|
|
|
|
|
// Deserialize the network from `data`: one byte selecting the hidden layer
// activation, followed by the two weight matrices in order.
void neural_network::load(binary_decoder& data) {
  const auto activation_id = data.next_1B();
  hidden_layer_activation = activation_function::type(activation_id);

  for (auto&& matrix : weights)
    load_matrix(data, matrix);
}
15408
|
|
|
|
|
|
|
|
15409
|
62
|
|
|
|
|
|
void neural_network::propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, |
15410
|
|
|
|
|
|
|
vector& hidden_layer, vector& outcomes, const embeddings_cache* cache, bool softmax) const { |
15411
|
62
|
50
|
|
|
|
|
assert(!weights[0].empty()); |
15412
|
62
|
50
|
|
|
|
|
assert(!weights[1].empty()); |
15413
|
1178
|
100
|
|
|
|
|
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
15414
|
|
|
|
|
|
|
|
15415
|
62
|
|
|
|
|
|
unsigned hidden_layer_size = weights[0].front().size(); |
15416
|
62
|
|
|
|
|
|
unsigned outcomes_size = weights[1].front().size(); |
15417
|
|
|
|
|
|
|
|
15418
|
124
|
|
|
|
|
|
outcomes.assign(outcomes_size, 0); |
15419
|
|
|
|
|
|
|
|
15420
|
|
|
|
|
|
|
// Hidden layer |
15421
|
62
|
|
|
|
|
|
hidden_layer.assign(hidden_layer_size, 0); |
15422
|
|
|
|
|
|
|
|
15423
|
|
|
|
|
|
|
unsigned index = 0; |
15424
|
1178
|
100
|
|
|
|
|
for (unsigned sequence = 0; sequence < embedding_ids_sequences.size(); sequence++) |
15425
|
5580
|
100
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); index += embeddings[i].dimension, i++) |
15426
|
6104
|
100
|
|
|
|
|
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
15427
|
1640
|
|
|
|
|
|
unsigned word = embedding_ids_sequences[sequence]->at(i); |
15428
|
3280
|
50
|
|
|
|
|
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
15429
|
|
|
|
|
|
|
// Use cache |
15430
|
1640
|
|
|
|
|
|
const float* precomputed = cache->at(i)[word].data() + sequence * hidden_layer_size; |
15431
|
9840
|
100
|
|
|
|
|
for (unsigned j = 0; j < hidden_layer_size; j++) |
15432
|
16400
|
|
|
|
|
|
hidden_layer[j] += precomputed[j]; |
15433
|
|
|
|
|
|
|
} else { |
15434
|
|
|
|
|
|
|
// Compute directly |
15435
|
|
|
|
|
|
|
const float* embedding = embeddings[i].weight(word); |
15436
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
15437
|
0
|
0
|
|
|
|
|
for (unsigned k = 0; k < hidden_layer_size; k++) |
15438
|
0
|
|
|
|
|
|
hidden_layer[k] += embedding[j] * weights[0][index + j][k]; |
15439
|
|
|
|
|
|
|
} |
15440
|
|
|
|
|
|
|
} |
15441
|
372
|
100
|
|
|
|
|
for (unsigned i = 0; i < hidden_layer_size; i++) // Bias |
15442
|
930
|
|
|
|
|
|
hidden_layer[i] += weights[0][index][i]; |
15443
|
|
|
|
|
|
|
|
15444
|
|
|
|
|
|
|
// Activation function |
15445
|
62
|
|
|
|
|
|
switch (hidden_layer_activation) { |
15446
|
|
|
|
|
|
|
case activation_function::TANH: |
15447
|
62
|
50
|
|
|
|
|
if (!tanh_cache.empty()) |
15448
|
372
|
100
|
|
|
|
|
for (auto&& weight : hidden_layer) |
15449
|
310
|
50
|
|
|
|
|
weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)]; |
|
|
50
|
|
|
|
|
|
15450
|
|
|
|
|
|
|
else |
15451
|
62
|
0
|
|
|
|
|
for (auto&& weight : hidden_layer) |
15452
|
0
|
|
|
|
|
|
weight = tanh(weight); |
15453
|
|
|
|
|
|
|
break; |
15454
|
|
|
|
|
|
|
case activation_function::CUBIC: |
15455
|
0
|
0
|
|
|
|
|
for (auto&& weight : hidden_layer) |
15456
|
0
|
|
|
|
|
|
weight = weight * weight * weight; |
15457
|
|
|
|
|
|
|
break; |
15458
|
|
|
|
|
|
|
case activation_function::RELU: |
15459
|
0
|
0
|
|
|
|
|
for (auto&& weight : hidden_layer) |
15460
|
0
|
0
|
|
|
|
|
if (weight < 0) weight = 0; |
15461
|
|
|
|
|
|
|
break; |
15462
|
|
|
|
|
|
|
} |
15463
|
|
|
|
|
|
|
|
15464
|
372
|
100
|
|
|
|
|
for (unsigned i = 0; i < hidden_layer_size; i++) |
15465
|
4340
|
100
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
15466
|
16120
|
|
|
|
|
|
outcomes[j] += hidden_layer[i] * weights[1][i][j]; |
15467
|
868
|
100
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
15468
|
2418
|
|
|
|
|
|
outcomes[i] += weights[1][hidden_layer_size][i]; |
15469
|
|
|
|
|
|
|
|
15470
|
|
|
|
|
|
|
// Softmax if requested |
15471
|
62
|
50
|
|
|
|
|
if (softmax) { |
15472
|
62
|
|
|
|
|
|
float max = outcomes[0]; |
15473
|
806
|
100
|
|
|
|
|
for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i]; |
|
|
100
|
|
|
|
|
|
15474
|
|
|
|
|
|
|
|
15475
|
|
|
|
|
|
|
float sum = 0; |
15476
|
868
|
100
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) sum += (outcomes[i] = exp(outcomes[i] - max)); |
15477
|
62
|
|
|
|
|
|
sum = 1 / sum; |
15478
|
|
|
|
|
|
|
|
15479
|
868
|
100
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) outcomes[i] *= sum; |
15480
|
|
|
|
|
|
|
} |
15481
|
62
|
|
|
|
|
|
} |
15482
|
|
|
|
|
|
|
|
15483
|
1
|
|
|
|
|
|
// Tabulate tanh on [-10, 10) with 32768 samples per unit; entry `slot`
// holds tanh(slot / 32768 - 10).
void neural_network::generate_tanh_cache() {
  tanh_cache.resize(2 * 10 * 32768);

  unsigned slot = 0;
  for (auto&& value : tanh_cache) {
    value = tanh(slot / 32768.0 - 10);
    slot++;
  }
}
15488
|
|
|
|
|
|
|
|
15489
|
2
|
|
|
|
|
|
void neural_network::generate_embeddings_cache(const vector& embeddings, embeddings_cache& cache, unsigned max_words) const { |
15490
|
|
|
|
|
|
|
unsigned embeddings_dim = 0; |
15491
|
5
|
100
|
|
|
|
|
for (auto&& embedding : embeddings) embeddings_dim += embedding.dimension; |
15492
|
|
|
|
|
|
|
|
15493
|
1
|
|
|
|
|
|
unsigned sequences = weights[0].size() / embeddings_dim; |
15494
|
1
|
50
|
|
|
|
|
assert(sequences * embeddings_dim + 1 == weights[0].size()); |
15495
|
|
|
|
|
|
|
|
15496
|
1
|
|
|
|
|
|
unsigned hidden_layer_size = weights[0].front().size(); |
15497
|
|
|
|
|
|
|
|
15498
|
1
|
|
|
|
|
|
cache.resize(embeddings.size()); |
15499
|
5
|
100
|
|
|
|
|
for (unsigned i = 0, weight_index = 0; i < embeddings.size(); weight_index += embeddings[i].dimension, i++) { |
15500
|
|
|
|
|
|
|
unsigned words = 0; |
15501
|
35
|
50
|
|
|
|
|
while (words < max_words && embeddings[i].weight(words)) words++; |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
15502
|
|
|
|
|
|
|
|
15503
|
4
|
|
|
|
|
|
cache[i].resize(words); |
15504
|
31
|
100
|
|
|
|
|
for (unsigned word = 0; word < words; word++) { |
15505
|
27
|
|
|
|
|
|
const float* embedding = embeddings[i].weight(word); |
15506
|
|
|
|
|
|
|
|
15507
|
27
|
|
|
|
|
|
cache[i][word].assign(sequences * hidden_layer_size, 0); |
15508
|
513
|
100
|
|
|
|
|
for (unsigned sequence = 0, index = weight_index; sequence < sequences; index += embeddings_dim, sequence++) |
15509
|
2916
|
100
|
|
|
|
|
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
15510
|
14580
|
100
|
|
|
|
|
for (unsigned k = 0; k < hidden_layer_size; k++) |
15511
|
36450
|
|
|
|
|
|
cache[i][word][sequence * hidden_layer_size + k] += embedding[j] * weights[0][index + j][k]; |
15512
|
|
|
|
|
|
|
} |
15513
|
|
|
|
|
|
|
} |
15514
|
1
|
|
|
|
|
|
} |
15515
|
|
|
|
|
|
|
|
15516
|
|
|
|
|
|
|
} // namespace parsito |
15517
|
|
|
|
|
|
|
|
15518
|
|
|
|
|
|
|
///////// |
15519
|
|
|
|
|
|
|
// File: parsito/network/network_parameters.h |
15520
|
|
|
|
|
|
|
///////// |
15521
|
|
|
|
|
|
|
|
15522
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
15523
|
|
|
|
|
|
|
// |
15524
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15525
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15526
|
|
|
|
|
|
|
// |
15527
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15528
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15529
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15530
|
|
|
|
|
|
|
|
15531
|
|
|
|
|
|
|
namespace parsito { |
15532
|
|
|
|
|
|
|
|
15533
|
|
|
|
|
|
|
// Update rule of the network trainer together with its hyperparameters.
struct network_trainer {
  enum network_trainer_algorithm { SGD, SGD_MOMENTUM, ADAGRAD, ADADELTA, ADAM };

  // Selected update rule.
  network_trainer_algorithm algorithm;
  // Initial learning rate.
  float learning_rate;
  // Learning rate to reach in the last iteration.
  float learning_rate_final;
  // Decay factor used by SGD_MOMENTUM, ADADELTA and ADAM.
  float momentum;
  // Second decay factor, used by ADAM only.
  float momentum2;
  // Constant added under the square roots of ADAGRAD, ADADELTA and ADAM.
  float epsilon;
};
15547
|
|
|
|
|
|
|
|
15548
|
|
|
|
|
|
|
struct network_parameters { |
15549
|
|
|
|
|
|
|
unsigned iterations; |
15550
|
|
|
|
|
|
|
int structured_interval; |
15551
|
|
|
|
|
|
|
unsigned hidden_layer; |
15552
|
|
|
|
|
|
|
activation_function::type hidden_layer_type; |
15553
|
|
|
|
|
|
|
network_trainer trainer; |
15554
|
|
|
|
|
|
|
unsigned batch_size; |
15555
|
|
|
|
|
|
|
float initialization_range; |
15556
|
|
|
|
|
|
|
float l1_regularization; |
15557
|
|
|
|
|
|
|
float l2_regularization; |
15558
|
|
|
|
|
|
|
float maxnorm_regularization; |
15559
|
|
|
|
|
|
|
float dropout_hidden, dropout_input; |
15560
|
|
|
|
|
|
|
bool early_stopping; |
15561
|
|
|
|
|
|
|
}; |
15562
|
|
|
|
|
|
|
|
15563
|
|
|
|
|
|
|
} // namespace parsito |
15564
|
|
|
|
|
|
|
|
15565
|
|
|
|
|
|
|
///////// |
15566
|
|
|
|
|
|
|
// File: parsito/network/neural_network_trainer.h |
15567
|
|
|
|
|
|
|
///////// |
15568
|
|
|
|
|
|
|
|
15569
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
15570
|
|
|
|
|
|
|
// |
15571
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15572
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15573
|
|
|
|
|
|
|
// |
15574
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15575
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15576
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15577
|
|
|
|
|
|
|
|
15578
|
|
|
|
|
|
|
namespace parsito { |
15579
|
|
|
|
|
|
|
|
15580
|
|
|
|
|
|
|
class neural_network_trainer { |
15581
|
|
|
|
|
|
|
public: |
15582
|
|
|
|
|
|
|
neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size, |
15583
|
|
|
|
|
|
|
const network_parameters& parameters, mt19937& generator); |
15584
|
|
|
|
|
|
|
|
15585
|
|
|
|
|
|
|
bool next_iteration(); |
15586
|
|
|
|
|
|
|
|
15587
|
0
|
0
|
|
|
|
|
struct workspace { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15588
|
|
|
|
|
|
|
unsigned batch = 0; |
15589
|
|
|
|
|
|
|
vector outcomes; |
15590
|
|
|
|
|
|
|
vector hidden_layer; |
15591
|
|
|
|
|
|
|
vector error_outcomes; |
15592
|
|
|
|
|
|
|
vector error_hidden; |
15593
|
|
|
|
|
|
|
|
15594
|
|
|
|
|
|
|
// Delta accumulators |
15595
|
|
|
|
|
|
|
vector> weights_batch[2]; |
15596
|
|
|
|
|
|
|
vector>> error_embedding; |
15597
|
|
|
|
|
|
|
vector> error_embedding_nonempty; |
15598
|
|
|
|
|
|
|
|
15599
|
|
|
|
|
|
|
// Trainer data |
15600
|
|
|
|
|
|
|
struct trainer_data { |
15601
|
|
|
|
|
|
|
float delta = 0; |
15602
|
|
|
|
|
|
|
float gradient = 0; |
15603
|
|
|
|
|
|
|
}; |
15604
|
|
|
|
|
|
|
vector> weights_trainer[2]; |
15605
|
|
|
|
|
|
|
vector>> embedding_trainer; |
15606
|
|
|
|
|
|
|
|
15607
|
|
|
|
|
|
|
// Dropout vectors |
15608
|
|
|
|
|
|
|
vector input_dropout; |
15609
|
|
|
|
|
|
|
vector hidden_dropout; |
15610
|
|
|
|
|
|
|
vector hidden_kept; |
15611
|
|
|
|
|
|
|
}; |
15612
|
|
|
|
|
|
|
void propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, workspace& w) const; |
15613
|
|
|
|
|
|
|
void backpropagate(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w); |
15614
|
|
|
|
|
|
|
|
15615
|
|
|
|
|
|
|
void finalize_sentence(); |
15616
|
|
|
|
|
|
|
|
15617
|
|
|
|
|
|
|
void save_network(binary_encoder& enc) const; |
15618
|
|
|
|
|
|
|
|
15619
|
|
|
|
|
|
|
private: |
15620
|
|
|
|
|
|
|
struct trainer_sgd { |
15621
|
|
|
|
|
|
|
static bool need_trainer_data; |
15622
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
15623
|
|
|
|
|
|
|
}; |
15624
|
|
|
|
|
|
|
struct trainer_sgd_momentum { |
15625
|
|
|
|
|
|
|
static bool need_trainer_data; |
15626
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
15627
|
|
|
|
|
|
|
}; |
15628
|
|
|
|
|
|
|
struct trainer_adagrad { |
15629
|
|
|
|
|
|
|
static bool need_trainer_data; |
15630
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
15631
|
|
|
|
|
|
|
}; |
15632
|
|
|
|
|
|
|
struct trainer_adadelta { |
15633
|
|
|
|
|
|
|
static bool need_trainer_data; |
15634
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
15635
|
|
|
|
|
|
|
}; |
15636
|
|
|
|
|
|
|
struct trainer_adam { |
15637
|
|
|
|
|
|
|
static bool need_trainer_data; |
15638
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
15639
|
|
|
|
|
|
|
}; |
15640
|
|
|
|
|
|
|
template void backpropagate_template(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w); |
15641
|
|
|
|
|
|
|
|
15642
|
|
|
|
|
|
|
void l1_regularize(); |
15643
|
|
|
|
|
|
|
void maxnorm_regularize(); |
15644
|
|
|
|
|
|
|
|
15645
|
|
|
|
|
|
|
void save_matrix(const vector>& m, binary_encoder& enc) const; |
15646
|
|
|
|
|
|
|
|
15647
|
|
|
|
|
|
|
neural_network& network; |
15648
|
|
|
|
|
|
|
mt19937& generator; |
15649
|
|
|
|
|
|
|
unsigned iteration, iterations, steps; |
15650
|
|
|
|
|
|
|
network_trainer trainer; |
15651
|
|
|
|
|
|
|
unsigned batch_size; |
15652
|
|
|
|
|
|
|
float l1_regularization, l2_regularization, maxnorm_regularization; |
15653
|
|
|
|
|
|
|
float dropout_hidden, dropout_input; |
15654
|
|
|
|
|
|
|
}; |
15655
|
|
|
|
|
|
|
|
15656
|
|
|
|
|
|
|
} // namespace parsito |
15657
|
|
|
|
|
|
|
|
15658
|
|
|
|
|
|
|
///////// |
15659
|
|
|
|
|
|
|
// File: parsito/network/neural_network_trainer.cpp |
15660
|
|
|
|
|
|
|
///////// |
15661
|
|
|
|
|
|
|
|
15662
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
15663
|
|
|
|
|
|
|
// |
15664
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
15665
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
15666
|
|
|
|
|
|
|
// |
15667
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
15668
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
15669
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
15670
|
|
|
|
|
|
|
|
15671
|
|
|
|
|
|
|
namespace parsito { |
15672
|
|
|
|
|
|
|
|
15673
|
0
|
|
|
|
|
|
// Create a trainer for `network`.
//
// When parameters.hidden_layer is nonzero, both weight matrices (each with an
// additional bias row) are filled with values drawn uniformly from
// [-range, range]. The range is parameters.initialization_range when that is
// positive; otherwise it is -initialization_range * sqrt(6 / (fan + hidden)),
// where fan is input_size for weights[0] and output_size for weights[1].
// All training settings are copied from `parameters`.
neural_network_trainer::neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size,
                                               const network_parameters& parameters, mt19937& generator) : network(network), generator(generator) {
  // Initialize hidden layer
  network.hidden_layer_activation = parameters.hidden_layer_type;
  if (parameters.hidden_layer) {
    float uniform_pre_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range :
        -parameters.initialization_range * sqrt(6.0 / float(input_size + parameters.hidden_layer));
    uniform_real_distribution<float> uniform_pre_hidden(-uniform_pre_hidden_range, uniform_pre_hidden_range);

    network.weights[0].resize(input_size + 1/*bias*/);
    for (auto&& row : network.weights[0]) {
      row.resize(parameters.hidden_layer);
      for (auto&& weight : row)
        weight = uniform_pre_hidden(generator);
    }

    float uniform_post_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range :
        -parameters.initialization_range * sqrt(6.0 / float(output_size + parameters.hidden_layer));
    uniform_real_distribution<float> uniform_post_hidden(-uniform_post_hidden_range, uniform_post_hidden_range);

    network.weights[1].resize(parameters.hidden_layer + 1/*bias*/);
    for (auto&& row : network.weights[1]) {
      row.resize(output_size);
      for (auto&& weight : row)
        weight = uniform_post_hidden(generator);
    }
  }

  // Store the network_parameters
  iteration = steps = 0;
  iterations = parameters.iterations;
  trainer = parameters.trainer;
  batch_size = parameters.batch_size;
  l1_regularization = parameters.l1_regularization;
  l2_regularization = parameters.l2_regularization;
  maxnorm_regularization = parameters.maxnorm_regularization;
  dropout_hidden = parameters.dropout_hidden;
  dropout_input = parameters.dropout_input;

  // Maxnorm regularize the created weights
  if (maxnorm_regularization) maxnorm_regularize();
}
15715
|
|
|
|
|
|
|
|
15716
|
0
|
|
|
|
|
|
bool neural_network_trainer::next_iteration() { |
15717
|
0
|
0
|
|
|
|
|
if (iteration++ >= iterations) return false; |
15718
|
|
|
|
|
|
|
|
15719
|
0
|
0
|
|
|
|
|
if (trainer.algorithm != network_trainer::ADADELTA) |
15720
|
0
|
0
|
|
|
|
|
if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1) |
|
|
0
|
|
|
|
|
|
15721
|
|
|
|
|
|
|
trainer.learning_rate = |
15722
|
0
|
|
|
|
|
|
exp(((iterations - iteration) * log(trainer.learning_rate) + log(trainer.learning_rate_final)) / (iterations - iteration + 1)); |
15723
|
|
|
|
|
|
|
|
15724
|
|
|
|
|
|
|
return true; |
15725
|
|
|
|
|
|
|
} |
15726
|
|
|
|
|
|
|
|
15727
|
0
|
|
|
|
|
|
void neural_network_trainer::propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, workspace& w) const { |
15728
|
|
|
|
|
|
|
// Initialize dropout if requested |
15729
|
0
|
0
|
|
|
|
|
if (dropout_input) { |
15730
|
0
|
|
|
|
|
|
w.input_dropout.resize(network.weights[0].size()); |
15731
|
0
|
|
|
|
|
|
bernoulli_distribution dropout(dropout_input); |
15732
|
0
|
0
|
|
|
|
|
for (auto&& flag : w.input_dropout) |
15733
|
0
|
|
|
|
|
|
flag = dropout(generator); |
15734
|
|
|
|
|
|
|
} |
15735
|
|
|
|
|
|
|
|
15736
|
0
|
0
|
|
|
|
|
if (dropout_hidden) { |
15737
|
0
|
|
|
|
|
|
w.hidden_dropout.resize(network.weights[1].size()); |
15738
|
0
|
|
|
|
|
|
bernoulli_distribution dropout(dropout_hidden); |
15739
|
0
|
0
|
|
|
|
|
for (auto&& flag : w.hidden_dropout) |
15740
|
0
|
|
|
|
|
|
flag = dropout(generator); |
15741
|
|
|
|
|
|
|
} |
15742
|
|
|
|
|
|
|
w.hidden_kept.clear(); |
15743
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < network.weights[0].front().size(); i++) |
15744
|
0
|
0
|
|
|
|
|
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15745
|
0
|
|
|
|
|
|
w.hidden_kept.push_back(i); |
15746
|
|
|
|
|
|
|
|
15747
|
|
|
|
|
|
|
// Propagate |
15748
|
|
|
|
|
|
|
unsigned hidden_layer_size = network.weights[0].front().size(); |
15749
|
0
|
|
|
|
|
|
unsigned outcomes_size = network.weights[1].front().size(); |
15750
|
|
|
|
|
|
|
|
15751
|
0
|
|
|
|
|
|
w.outcomes.assign(outcomes_size, 0); |
15752
|
|
|
|
|
|
|
|
15753
|
|
|
|
|
|
|
// Hidden layer |
15754
|
0
|
|
|
|
|
|
w.hidden_layer.assign(hidden_layer_size, 0); |
15755
|
|
|
|
|
|
|
|
15756
|
|
|
|
|
|
|
unsigned index = 0; |
15757
|
0
|
0
|
|
|
|
|
for (auto&& embedding_ids : embedding_ids_sequences) |
15758
|
|
|
|
|
|
|
// Note: The unnecessary brackets on the following for cycle are needed |
15759
|
|
|
|
|
|
|
// to compile on VS 2015 Update 3, which otherwise fail to compile it. |
15760
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); i++) { |
15761
|
0
|
0
|
|
|
|
|
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15762
|
0
|
|
|
|
|
|
const float* embedding = embeddings[i].weight((*embedding_ids)[i]); |
15763
|
0
|
0
|
|
|
|
|
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, embedding++, index++) |
15764
|
0
|
0
|
|
|
|
|
if (w.input_dropout.empty() || !w.input_dropout[index]) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15765
|
0
|
0
|
|
|
|
|
for (auto&& j : w.hidden_kept) |
15766
|
0
|
|
|
|
|
|
w.hidden_layer[j] += *embedding * network.weights[0][index][j]; |
15767
|
|
|
|
|
|
|
} else { |
15768
|
0
|
|
|
|
|
|
index += embeddings[i].dimension; |
15769
|
|
|
|
|
|
|
} |
15770
|
|
|
|
|
|
|
} |
15771
|
0
|
0
|
|
|
|
|
if (dropout_input) { // Dropout normalization |
15772
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_input); |
15773
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
15774
|
0
|
|
|
|
|
|
w.hidden_layer[i] *= dropout_factor; |
15775
|
|
|
|
|
|
|
} |
15776
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) // Bias |
15777
|
0
|
|
|
|
|
|
w.hidden_layer[i] += network.weights[0][index][i]; |
15778
|
|
|
|
|
|
|
|
15779
|
|
|
|
|
|
|
// Activation function |
15780
|
0
|
|
|
|
|
|
switch (network.hidden_layer_activation) { |
15781
|
|
|
|
|
|
|
case activation_function::TANH: |
15782
|
0
|
0
|
|
|
|
|
for (auto&& weight : w.hidden_layer) |
15783
|
0
|
|
|
|
|
|
weight = tanh(weight); |
15784
|
|
|
|
|
|
|
break; |
15785
|
|
|
|
|
|
|
case activation_function::CUBIC: |
15786
|
0
|
0
|
|
|
|
|
for (auto&& weight : w.hidden_layer) |
15787
|
0
|
|
|
|
|
|
weight = weight * weight * weight; |
15788
|
|
|
|
|
|
|
break; |
15789
|
|
|
|
|
|
|
case activation_function::RELU: |
15790
|
0
|
0
|
|
|
|
|
for (auto&& weight : w.hidden_layer) |
15791
|
0
|
0
|
|
|
|
|
if (weight < 0) weight = 0; |
15792
|
|
|
|
|
|
|
break; |
15793
|
|
|
|
|
|
|
} |
15794
|
0
|
0
|
|
|
|
|
if (dropout_hidden) { // Dropout normalization |
15795
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_hidden); |
15796
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
15797
|
0
|
|
|
|
|
|
w.hidden_layer[i] *= dropout_factor; |
15798
|
|
|
|
|
|
|
} |
15799
|
|
|
|
|
|
|
|
15800
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
15801
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
15802
|
0
|
|
|
|
|
|
w.outcomes[j] += w.hidden_layer[i] * network.weights[1][i][j]; |
15803
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
15804
|
0
|
|
|
|
|
|
w.outcomes[i] += network.weights[1][hidden_layer_size][i]; |
15805
|
|
|
|
|
|
|
|
15806
|
|
|
|
|
|
|
// Softmax |
15807
|
0
|
|
|
|
|
|
float max = w.outcomes[0]; |
15808
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i]; |
|
|
0
|
|
|
|
|
|
15809
|
|
|
|
|
|
|
|
15810
|
|
|
|
|
|
|
float sum = 0; |
15811
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) sum += (w.outcomes[i] = exp(w.outcomes[i] - max)); |
15812
|
0
|
|
|
|
|
|
sum = 1 / sum; |
15813
|
|
|
|
|
|
|
|
15814
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) w.outcomes[i] *= sum; |
15815
|
0
|
|
|
|
|
|
} |
15816
|
|
|
|
|
|
|
|
15817
|
|
|
|
|
|
|
// SGD |
15818
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_sgd::need_trainer_data = false; |
15819
|
|
|
|
|
|
|
float neural_network_trainer::trainer_sgd::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& /*data*/) { |
15820
|
0
|
|
|
|
|
|
return trainer.learning_rate * gradient; |
15821
|
|
|
|
|
|
|
} |
15822
|
|
|
|
|
|
|
|
15823
|
|
|
|
|
|
|
// SGD with momentum |
15824
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_sgd_momentum::need_trainer_data = true; |
15825
|
|
|
|
|
|
|
float neural_network_trainer::trainer_sgd_momentum::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
15826
|
0
|
|
|
|
|
|
data.delta = trainer.momentum * data.delta + trainer.learning_rate * gradient; |
15827
|
|
|
|
|
|
|
return data.delta; |
15828
|
|
|
|
|
|
|
} |
15829
|
|
|
|
|
|
|
|
15830
|
|
|
|
|
|
|
// AdaGrad |
15831
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_adagrad::need_trainer_data = true; |
15832
|
|
|
|
|
|
|
float neural_network_trainer::trainer_adagrad::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
15833
|
0
|
|
|
|
|
|
data.gradient += gradient * gradient; |
15834
|
0
|
|
|
|
|
|
return trainer.learning_rate / sqrt(data.gradient + trainer.epsilon) * gradient; |
15835
|
|
|
|
|
|
|
} |
15836
|
|
|
|
|
|
|
|
15837
|
|
|
|
|
|
|
// AdaDelta |
15838
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_adadelta::need_trainer_data = true; |
15839
|
0
|
|
|
|
|
|
float neural_network_trainer::trainer_adadelta::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
15840
|
0
|
|
|
|
|
|
data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient * gradient; |
15841
|
0
|
|
|
|
|
|
float delta = sqrt(data.delta + trainer.epsilon) / sqrt(data.gradient + trainer.epsilon) * gradient; |
15842
|
0
|
|
|
|
|
|
data.delta = trainer.momentum * data.delta + (1 - trainer.momentum) * delta * delta; |
15843
|
0
|
|
|
|
|
|
return delta; |
15844
|
|
|
|
|
|
|
} |
15845
|
|
|
|
|
|
|
|
15846
|
|
|
|
|
|
|
// Adam |
15847
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_adam::need_trainer_data = true; |
15848
|
0
|
|
|
|
|
|
float neural_network_trainer::trainer_adam::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
15849
|
0
|
|
|
|
|
|
data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient; |
15850
|
0
|
|
|
|
|
|
data.delta = trainer.momentum2 * data.delta + (1 - trainer.momentum2) * gradient * gradient; |
15851
|
0
|
|
|
|
|
|
return trainer.learning_rate * data.gradient / sqrt(data.delta + trainer.epsilon); |
15852
|
|
|
|
|
|
|
} |
15853
|
|
|
|
|
|
|
|
15854
|
|
|
|
|
|
|
// Backpropagation |
15855
|
|
|
|
|
|
|
template |
15856
|
0
|
|
|
|
|
|
void neural_network_trainer::backpropagate_template(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) { |
15857
|
0
|
|
|
|
|
|
size_t hidden_layer_size = network.weights[0].front().size(); |
15858
|
0
|
|
|
|
|
|
size_t outcomes_size = network.weights[1].front().size(); |
15859
|
|
|
|
|
|
|
|
15860
|
|
|
|
|
|
|
// Allocate space for delta accumulators |
15861
|
0
|
0
|
|
|
|
|
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15862
|
0
|
0
|
|
|
|
|
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15863
|
0
|
0
|
|
|
|
|
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15864
|
0
|
0
|
|
|
|
|
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15865
|
|
|
|
|
|
|
|
15866
|
|
|
|
|
|
|
// Allocate space for trainer_data if required) |
15867
|
0
|
|
|
|
|
|
workspace::trainer_data none_trainer_data; |
15868
|
0
|
0
|
|
|
|
|
if (TRAINER::need_trainer_data) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15869
|
0
|
0
|
|
|
|
|
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15870
|
0
|
0
|
|
|
|
|
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15871
|
|
|
|
|
|
|
} |
15872
|
|
|
|
|
|
|
|
15873
|
|
|
|
|
|
|
// Compute error vector |
15874
|
0
|
|
|
|
|
|
w.error_outcomes.resize(outcomes_size); |
15875
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15876
|
0
|
0
|
|
|
|
|
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15877
|
|
|
|
|
|
|
|
15878
|
|
|
|
|
|
|
// Backpropagate error_outcomes to error_hidden |
15879
|
0
|
|
|
|
|
|
w.error_hidden.assign(hidden_layer_size, 0); |
15880
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15881
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15882
|
0
|
|
|
|
|
|
w.error_hidden[i] += network.weights[1][i][j] * w.error_outcomes[j]; |
15883
|
|
|
|
|
|
|
// Dropout normalization |
15884
|
0
|
0
|
|
|
|
|
if (dropout_hidden) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15885
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_hidden); |
15886
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15887
|
0
|
|
|
|
|
|
w.error_hidden[i] *= dropout_factor; |
15888
|
|
|
|
|
|
|
} |
15889
|
|
|
|
|
|
|
|
15890
|
|
|
|
|
|
|
// Perform activation function derivation |
15891
|
0
|
|
|
|
|
|
switch (network.hidden_layer_activation) { |
15892
|
|
|
|
|
|
|
case activation_function::TANH: |
15893
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15894
|
0
|
|
|
|
|
|
w.error_hidden[i] *= 1 - w.hidden_layer[i] * w.hidden_layer[i]; |
15895
|
|
|
|
|
|
|
break; |
15896
|
|
|
|
|
|
|
case activation_function::CUBIC: |
15897
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15898
|
0
|
|
|
|
|
|
float hidden_layer = cbrt(w.hidden_layer[i]); |
15899
|
0
|
|
|
|
|
|
w.error_hidden[i] *= 3 * hidden_layer * hidden_layer; |
15900
|
|
|
|
|
|
|
} |
15901
|
|
|
|
|
|
|
break; |
15902
|
|
|
|
|
|
|
case activation_function::RELU: |
15903
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15904
|
0
|
0
|
|
|
|
|
if (w.hidden_layer[i] <= 0) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15905
|
0
|
|
|
|
|
|
w.error_hidden[i] = 0; |
15906
|
|
|
|
|
|
|
break; |
15907
|
|
|
|
|
|
|
} |
15908
|
|
|
|
|
|
|
|
15909
|
|
|
|
|
|
|
// Update weights[1] |
15910
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15911
|
0
|
0
|
|
|
|
|
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15912
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15913
|
0
|
|
|
|
|
|
w.weights_batch[1][i][j] += w.hidden_layer[i] * w.error_outcomes[j]; |
15914
|
|
|
|
|
|
|
} |
15915
|
|
|
|
|
|
|
// Bias |
15916
|
0
|
0
|
|
|
|
|
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15917
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15918
|
0
|
|
|
|
|
|
w.weights_batch[1][hidden_layer_size][i] += w.error_outcomes[i]; |
15919
|
|
|
|
|
|
|
|
15920
|
|
|
|
|
|
|
// Dropout normalization |
15921
|
0
|
0
|
|
|
|
|
if (dropout_input) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15922
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_input); |
15923
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15924
|
0
|
|
|
|
|
|
w.error_hidden[i] *= dropout_factor; |
15925
|
|
|
|
|
|
|
} |
15926
|
|
|
|
|
|
|
// Update weights[0] and backpropagate to error_embedding |
15927
|
|
|
|
|
|
|
unsigned index = 0; |
15928
|
0
|
0
|
|
|
|
|
for (auto&& embedding_ids : embedding_ids_sequences) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15929
|
|
|
|
|
|
|
// Note: The unnecessary brackets on the following for cycle are needed |
15930
|
|
|
|
|
|
|
// to compile on VS 2015 Update 3, which otherwise fail to compile it. |
15931
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15932
|
0
|
0
|
|
|
|
|
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15933
|
0
|
|
|
|
|
|
int embedding_id = (*embedding_ids)[i]; |
15934
|
|
|
|
|
|
|
|
15935
|
|
|
|
|
|
|
float* error_embedding = nullptr; // Accumulate embedding error if required |
15936
|
0
|
0
|
|
|
|
|
if (embeddings[i].can_update_weights(embedding_id)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15937
|
0
|
0
|
|
|
|
|
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15938
|
0
|
0
|
|
|
|
|
if (w.error_embedding[i][embedding_id].empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15939
|
0
|
|
|
|
|
|
w.error_embedding[i][embedding_id].assign(embeddings[i].dimension, 0); |
15940
|
0
|
0
|
|
|
|
|
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15941
|
|
|
|
|
|
|
} |
15942
|
0
|
|
|
|
|
|
error_embedding = w.error_embedding[i][embedding_id].data(); |
15943
|
|
|
|
|
|
|
} |
15944
|
|
|
|
|
|
|
|
15945
|
0
|
|
|
|
|
|
const float* embedding = embeddings[i].weight(embedding_id); |
15946
|
0
|
0
|
|
|
|
|
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15947
|
0
|
0
|
|
|
|
|
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15948
|
0
|
0
|
|
|
|
|
if (error_embedding) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15949
|
0
|
0
|
|
|
|
|
for (auto&& j : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15950
|
0
|
|
|
|
|
|
*error_embedding += network.weights[0][index][j] * w.error_hidden[j]; |
15951
|
0
|
0
|
|
|
|
|
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15952
|
0
|
0
|
|
|
|
|
for (auto&& j : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15953
|
0
|
|
|
|
|
|
w.weights_batch[0][index][j] += *embedding * w.error_hidden[j]; |
15954
|
|
|
|
|
|
|
} |
15955
|
|
|
|
|
|
|
} else { |
15956
|
0
|
|
|
|
|
|
index += embeddings[i].dimension; |
15957
|
|
|
|
|
|
|
} |
15958
|
|
|
|
|
|
|
} |
15959
|
|
|
|
|
|
|
// Bias |
15960
|
|
|
|
|
|
|
{ |
15961
|
0
|
|
|
|
|
|
float negate_input_dropout = 1. - dropout_hidden; |
15962
|
0
|
0
|
|
|
|
|
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15963
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15964
|
0
|
|
|
|
|
|
w.weights_batch[0][index][i] += w.error_hidden[i] * negate_input_dropout; |
15965
|
|
|
|
|
|
|
} |
15966
|
|
|
|
|
|
|
|
15967
|
|
|
|
|
|
|
// End if not at the end of the batch |
15968
|
0
|
0
|
|
|
|
|
if (++w.batch < batch_size) return; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15969
|
0
|
|
|
|
|
|
w.batch = 0; |
15970
|
|
|
|
|
|
|
|
15971
|
|
|
|
|
|
|
// Update hidden weights |
15972
|
0
|
0
|
|
|
|
|
if (!network.weights[0].empty()) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15973
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 2; i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15974
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15975
|
0
|
0
|
|
|
|
|
if (!w.weights_batch[i][j].empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15976
|
0
|
0
|
|
|
|
|
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15977
|
0
|
0
|
|
|
|
|
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15978
|
|
|
|
|
|
|
w.weights_batch[i][j].clear(); |
15979
|
|
|
|
|
|
|
} |
15980
|
|
|
|
|
|
|
} |
15981
|
|
|
|
|
|
|
|
15982
|
|
|
|
|
|
|
// Update embedding weights using error_embedding |
15983
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15984
|
0
|
0
|
|
|
|
|
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15985
|
0
|
0
|
|
|
|
|
if (TRAINER::need_trainer_data) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15986
|
0
|
0
|
|
|
|
|
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15987
|
0
|
0
|
|
|
|
|
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15988
|
0
|
0
|
|
|
|
|
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15989
|
|
|
|
|
|
|
} |
15990
|
0
|
|
|
|
|
|
float* embedding = embeddings[i].weight(id); |
15991
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15992
|
0
|
0
|
|
|
|
|
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
15993
|
0
|
|
|
|
|
|
w.error_embedding[i][id].clear(); |
15994
|
|
|
|
|
|
|
} |
15995
|
|
|
|
|
|
|
w.error_embedding_nonempty[i].clear(); |
15996
|
|
|
|
|
|
|
} |
15997
|
|
|
|
|
|
|
|
15998
|
|
|
|
|
|
|
// Maxnorm regularize the updated weights |
15999
|
0
|
0
|
|
|
|
|
if (maxnorm_regularization) maxnorm_regularize(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16000
|
|
|
|
|
|
|
} |
16001
|
|
|
|
|
|
|
|
16002
|
0
|
|
|
|
|
|
void neural_network_trainer::backpropagate(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) { |
16003
|
0
|
|
|
|
|
|
steps++; |
16004
|
|
|
|
|
|
|
|
16005
|
0
|
|
|
|
|
|
switch (trainer.algorithm) { |
16006
|
|
|
|
|
|
|
case network_trainer::SGD: |
16007
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
16008
|
0
|
|
|
|
|
|
return; |
16009
|
|
|
|
|
|
|
case network_trainer::SGD_MOMENTUM: |
16010
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
16011
|
0
|
|
|
|
|
|
return; |
16012
|
|
|
|
|
|
|
case network_trainer::ADAGRAD: |
16013
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
16014
|
0
|
|
|
|
|
|
return; |
16015
|
|
|
|
|
|
|
case network_trainer::ADADELTA: |
16016
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
16017
|
0
|
|
|
|
|
|
return; |
16018
|
|
|
|
|
|
|
case network_trainer::ADAM: |
16019
|
0
|
|
|
|
|
|
float original_learning_rate = trainer.learning_rate; |
16020
|
0
|
|
|
|
|
|
trainer.learning_rate *= sqrt(1-pow(trainer.momentum2, steps)) / (1-pow(trainer.momentum, steps)); |
16021
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
16022
|
0
|
|
|
|
|
|
trainer.learning_rate = original_learning_rate; |
16023
|
0
|
|
|
|
|
|
return; |
16024
|
|
|
|
|
|
|
} |
16025
|
|
|
|
|
|
|
|
16026
|
0
|
0
|
|
|
|
|
training_failure("Internal error, unsupported trainer!"); |
|
|
0
|
|
|
|
|
|
16027
|
|
|
|
|
|
|
} |
16028
|
|
|
|
|
|
|
|
16029
|
0
|
|
|
|
|
|
void neural_network_trainer::l1_regularize() { |
16030
|
0
|
0
|
|
|
|
|
if (!l1_regularization) return; |
16031
|
|
|
|
|
|
|
|
16032
|
0
|
0
|
|
|
|
|
for (auto&& weights : network.weights) |
16033
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i + 1 /*ignore biases*/ < weights.size(); i++) { |
16034
|
0
|
|
|
|
|
|
auto& row = weights[i]; |
16035
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) |
16036
|
0
|
0
|
|
|
|
|
if (weight < l1_regularization) weight += l1_regularization; |
16037
|
0
|
0
|
|
|
|
|
else if (weight > l1_regularization) weight -= l1_regularization; |
16038
|
0
|
|
|
|
|
|
else weight = 0; |
16039
|
|
|
|
|
|
|
} |
16040
|
|
|
|
|
|
|
} |
16041
|
|
|
|
|
|
|
|
16042
|
0
|
|
|
|
|
|
void neural_network_trainer::maxnorm_regularize() { |
16043
|
0
|
0
|
|
|
|
|
if (!maxnorm_regularization) return; |
16044
|
|
|
|
|
|
|
|
16045
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) |
16046
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < network.weights[i].front().size(); j++) { |
16047
|
|
|
|
|
|
|
float length = 0; |
16048
|
0
|
0
|
|
|
|
|
for (auto&& row : network.weights[i]) |
16049
|
0
|
|
|
|
|
|
length += row[j] * row[j]; |
16050
|
|
|
|
|
|
|
|
16051
|
0
|
0
|
|
|
|
|
if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) { |
|
|
0
|
|
|
|
|
|
16052
|
0
|
|
|
|
|
|
float factor = 1 / sqrt(length / (maxnorm_regularization * maxnorm_regularization)); |
16053
|
0
|
0
|
|
|
|
|
for (auto&& row : network.weights[i]) |
16054
|
0
|
|
|
|
|
|
row[j] *= factor; |
16055
|
|
|
|
|
|
|
} |
16056
|
|
|
|
|
|
|
} |
16057
|
|
|
|
|
|
|
} |
16058
|
|
|
|
|
|
|
|
16059
|
0
|
|
|
|
|
|
void neural_network_trainer::finalize_sentence() { |
16060
|
0
|
0
|
|
|
|
|
if (l1_regularization) l1_regularize(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16061
|
0
|
|
|
|
|
|
} |
16062
|
|
|
|
|
|
|
|
16063
|
0
|
|
|
|
|
|
void neural_network_trainer::save_matrix(const vector>& m, binary_encoder& enc) const { |
16064
|
0
|
|
|
|
|
|
enc.add_4B(m.size()); |
16065
|
0
|
0
|
|
|
|
|
enc.add_4B(m.empty() ? 0 : m.front().size()); |
16066
|
|
|
|
|
|
|
|
16067
|
0
|
0
|
|
|
|
|
for (auto&& row : m) { |
16068
|
0
|
0
|
|
|
|
|
assert(row.size() == m.front().size()); |
16069
|
|
|
|
|
|
|
enc.add_data(row); |
16070
|
|
|
|
|
|
|
} |
16071
|
0
|
|
|
|
|
|
} |
16072
|
|
|
|
|
|
|
|
16073
|
0
|
|
|
|
|
|
// Serialize the network into `enc`: the hidden layer activation as one byte,
// then the two weight matrices in index order.
void neural_network_trainer::save_network(binary_encoder& enc) const {
  enc.add_1B(network.hidden_layer_activation);

  const auto& first_matrix = network.weights[0];
  const auto& second_matrix = network.weights[1];
  save_matrix(first_matrix, enc);
  save_matrix(second_matrix, enc);
}
16078
|
|
|
|
|
|
|
|
16079
|
|
|
|
|
|
|
} // namespace parsito |
16080
|
|
|
|
|
|
|
|
16081
|
|
|
|
|
|
|
///////// |
16082
|
|
|
|
|
|
|
// File: parsito/transition/transition.h |
16083
|
|
|
|
|
|
|
///////// |
16084
|
|
|
|
|
|
|
|
16085
|
|
|
|
|
|
|
// This file is part of Parsito . |
16086
|
|
|
|
|
|
|
// |
16087
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16088
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16089
|
|
|
|
|
|
|
// |
16090
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16091
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16092
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16093
|
|
|
|
|
|
|
|
16094
|
|
|
|
|
|
|
namespace parsito { |
16095
|
|
|
|
|
|
|
|
16096
|
|
|
|
|
|
|
// Abstract transition class |
16097
|
13
|
|
|
|
|
|
class transition { |
16098
|
|
|
|
|
|
|
public: |
16099
|
13
|
|
|
|
|
|
virtual ~transition() {} |
16100
|
|
|
|
|
|
|
|
16101
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const = 0; |
16102
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const = 0; |
16103
|
|
|
|
|
|
|
}; |
16104
|
|
|
|
|
|
|
|
16105
|
|
|
|
|
|
|
// Specific transition classes |
16106
|
12
|
|
|
|
|
|
class transition_left_arc : public transition { |
16107
|
|
|
|
|
|
|
public: |
16108
|
6
|
|
|
|
|
|
transition_left_arc(const string& label) : label(label), label_is_root(label == "root") {} |
16109
|
|
|
|
|
|
|
|
16110
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
16111
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
16112
|
|
|
|
|
|
|
private: |
16113
|
|
|
|
|
|
|
string label; |
16114
|
|
|
|
|
|
|
bool label_is_root; |
16115
|
|
|
|
|
|
|
}; |
16116
|
|
|
|
|
|
|
|
16117
|
12
|
|
|
|
|
|
class transition_right_arc : public transition { |
16118
|
|
|
|
|
|
|
public: |
16119
|
6
|
|
|
|
|
|
transition_right_arc(const string& label) : label(label), label_is_root(label == "root") {} |
16120
|
|
|
|
|
|
|
|
16121
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
16122
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
16123
|
|
|
|
|
|
|
private: |
16124
|
|
|
|
|
|
|
string label; |
16125
|
|
|
|
|
|
|
bool label_is_root; |
16126
|
|
|
|
|
|
|
}; |
16127
|
|
|
|
|
|
|
|
16128
|
2
|
|
|
|
|
|
class transition_shift : public transition { |
16129
|
|
|
|
|
|
|
public: |
16130
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
16131
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
16132
|
|
|
|
|
|
|
}; |
16133
|
|
|
|
|
|
|
|
16134
|
0
|
|
|
|
|
|
class transition_swap : public transition { |
16135
|
|
|
|
|
|
|
public: |
16136
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
16137
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
16138
|
|
|
|
|
|
|
}; |
16139
|
|
|
|
|
|
|
|
16140
|
0
|
|
|
|
|
|
class transition_left_arc_2 : public transition { |
16141
|
|
|
|
|
|
|
public: |
16142
|
0
|
|
|
|
|
|
transition_left_arc_2(const string& label) : label(label), label_is_root(label == "root") {} |
16143
|
|
|
|
|
|
|
|
16144
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
16145
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
16146
|
|
|
|
|
|
|
private: |
16147
|
|
|
|
|
|
|
string label; |
16148
|
|
|
|
|
|
|
bool label_is_root; |
16149
|
|
|
|
|
|
|
}; |
16150
|
|
|
|
|
|
|
|
16151
|
0
|
|
|
|
|
|
class transition_right_arc_2 : public transition { |
16152
|
|
|
|
|
|
|
public: |
16153
|
0
|
|
|
|
|
|
transition_right_arc_2(const string& label) : label(label), label_is_root(label == "root") {} |
16154
|
|
|
|
|
|
|
|
16155
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
16156
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
16157
|
|
|
|
|
|
|
private: |
16158
|
|
|
|
|
|
|
string label; |
16159
|
|
|
|
|
|
|
bool label_is_root; |
16160
|
|
|
|
|
|
|
}; |
16161
|
|
|
|
|
|
|
|
16162
|
|
|
|
|
|
|
} // namespace parsito |
16163
|
|
|
|
|
|
|
|
16164
|
|
|
|
|
|
|
///////// |
16165
|
|
|
|
|
|
|
// File: parsito/transition/transition_oracle.h |
16166
|
|
|
|
|
|
|
///////// |
16167
|
|
|
|
|
|
|
|
16168
|
|
|
|
|
|
|
// This file is part of Parsito . |
16169
|
|
|
|
|
|
|
// |
16170
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16171
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16172
|
|
|
|
|
|
|
// |
16173
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16174
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16175
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16176
|
|
|
|
|
|
|
|
16177
|
|
|
|
|
|
|
namespace parsito { |
16178
|
|
|
|
|
|
|
|
16179
|
0
|
|
|
|
|
|
class transition_oracle { |
16180
|
|
|
|
|
|
|
public: |
16181
|
0
|
|
|
|
|
|
virtual ~transition_oracle() {} |
16182
|
|
|
|
|
|
|
|
16183
|
|
|
|
|
|
|
struct predicted_transition { |
16184
|
|
|
|
|
|
|
unsigned best; |
16185
|
|
|
|
|
|
|
unsigned to_follow; |
16186
|
|
|
|
|
|
|
|
16187
|
|
|
|
|
|
|
predicted_transition(unsigned best, unsigned to_follow) : best(best), to_follow(to_follow) {} |
16188
|
|
|
|
|
|
|
}; |
16189
|
|
|
|
|
|
|
|
16190
|
0
|
|
|
|
|
|
class tree_oracle { |
16191
|
|
|
|
|
|
|
public: |
16192
|
0
|
|
|
|
|
|
virtual ~tree_oracle() {} |
16193
|
|
|
|
|
|
|
|
16194
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const = 0; |
16195
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const = 0; |
16196
|
|
|
|
|
|
|
}; |
16197
|
|
|
|
|
|
|
|
16198
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const = 0; |
16199
|
|
|
|
|
|
|
}; |
16200
|
|
|
|
|
|
|
|
16201
|
|
|
|
|
|
|
} // namespace parsito |
16202
|
|
|
|
|
|
|
|
16203
|
|
|
|
|
|
|
///////// |
16204
|
|
|
|
|
|
|
// File: parsito/transition/transition_system.h |
16205
|
|
|
|
|
|
|
///////// |
16206
|
|
|
|
|
|
|
|
16207
|
|
|
|
|
|
|
// This file is part of Parsito . |
16208
|
|
|
|
|
|
|
// |
16209
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16210
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16211
|
|
|
|
|
|
|
// |
16212
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16213
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16214
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16215
|
|
|
|
|
|
|
|
16216
|
|
|
|
|
|
|
namespace parsito { |
16217
|
|
|
|
|
|
|
|
16218
|
|
|
|
|
|
|
class transition_system { |
16219
|
|
|
|
|
|
|
public: |
16220
|
1
|
|
|
|
|
|
virtual ~transition_system() {} |
16221
|
|
|
|
|
|
|
|
16222
|
|
|
|
|
|
|
virtual unsigned transition_count() const; |
16223
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf, unsigned transition) const; |
16224
|
|
|
|
|
|
|
virtual int perform(configuration& conf, unsigned transition) const; |
16225
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const = 0; |
16226
|
|
|
|
|
|
|
|
16227
|
|
|
|
|
|
|
static transition_system* create(const string& name, const vector& labels); |
16228
|
|
|
|
|
|
|
|
16229
|
|
|
|
|
|
|
protected: |
16230
|
1
|
|
|
|
|
|
transition_system(const vector& labels) : labels(labels) {} |
16231
|
|
|
|
|
|
|
|
16232
|
|
|
|
|
|
|
const vector& labels; |
16233
|
|
|
|
|
|
|
vector> transitions; |
16234
|
|
|
|
|
|
|
}; |
16235
|
|
|
|
|
|
|
|
16236
|
|
|
|
|
|
|
} // namespace parsito |
16237
|
|
|
|
|
|
|
|
16238
|
|
|
|
|
|
|
///////// |
16239
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn.h |
16240
|
|
|
|
|
|
|
///////// |
16241
|
|
|
|
|
|
|
|
16242
|
|
|
|
|
|
|
// This file is part of Parsito . |
16243
|
|
|
|
|
|
|
// |
16244
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16245
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16246
|
|
|
|
|
|
|
// |
16247
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16248
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16249
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16250
|
|
|
|
|
|
|
|
16251
|
|
|
|
|
|
|
namespace parsito { |
16252
|
|
|
|
|
|
|
|
16253
|
5
|
|
|
|
|
|
class parser_nn : public parser { |
16254
|
|
|
|
|
|
|
public: |
16255
|
|
|
|
|
|
|
parser_nn(bool versioned); |
16256
|
|
|
|
|
|
|
|
16257
|
|
|
|
|
|
|
virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const override; |
16258
|
|
|
|
|
|
|
|
16259
|
|
|
|
|
|
|
protected: |
16260
|
|
|
|
|
|
|
virtual void load(binary_decoder& data, unsigned cache) override; |
16261
|
|
|
|
|
|
|
|
16262
|
|
|
|
|
|
|
private: |
16263
|
|
|
|
|
|
|
friend class parser_nn_trainer; |
16264
|
|
|
|
|
|
|
void parse_greedy(tree& t, double* cost) const; |
16265
|
|
|
|
|
|
|
void parse_beam_search(tree& t, unsigned beam_size, double* cost) const; |
16266
|
|
|
|
|
|
|
|
16267
|
|
|
|
|
|
|
bool versioned; |
16268
|
|
|
|
|
|
|
unsigned version; |
16269
|
|
|
|
|
|
|
bool single_root; |
16270
|
|
|
|
|
|
|
enum { VERSION_LATEST = 2 }; |
16271
|
|
|
|
|
|
|
|
16272
|
|
|
|
|
|
|
vector labels; |
16273
|
|
|
|
|
|
|
unique_ptr system; |
16274
|
|
|
|
|
|
|
|
16275
|
|
|
|
|
|
|
node_extractor nodes; |
16276
|
|
|
|
|
|
|
|
16277
|
|
|
|
|
|
|
vector values; |
16278
|
|
|
|
|
|
|
vector embeddings; |
16279
|
|
|
|
|
|
|
|
16280
|
|
|
|
|
|
|
neural_network network; |
16281
|
|
|
|
|
|
|
neural_network::embeddings_cache embeddings_cache; |
16282
|
|
|
|
|
|
|
|
16283
|
6
|
50
|
|
|
|
|
struct workspace { |
|
|
100
|
|
|
|
|
|
16284
|
4
|
100
|
|
|
|
|
workspace(bool single_root) : conf(single_root) {} |
16285
|
|
|
|
|
|
|
|
16286
|
|
|
|
|
|
|
configuration conf; |
16287
|
|
|
|
|
|
|
|
16288
|
|
|
|
|
|
|
string word, word_buffer; |
16289
|
|
|
|
|
|
|
vector> embeddings; |
16290
|
|
|
|
|
|
|
vector> embeddings_values; |
16291
|
|
|
|
|
|
|
|
16292
|
|
|
|
|
|
|
vector extracted_nodes; |
16293
|
|
|
|
|
|
|
vector*> extracted_embeddings; |
16294
|
|
|
|
|
|
|
|
16295
|
|
|
|
|
|
|
vector outcomes, network_buffer; |
16296
|
|
|
|
|
|
|
|
16297
|
|
|
|
|
|
|
// Beam-size structures |
16298
|
228
|
|
|
|
|
|
struct beam_size_configuration { |
16299
|
|
|
|
|
|
|
beam_size_configuration(bool single_root) : conf(single_root) {} |
16300
|
|
|
|
|
|
|
|
16301
|
|
|
|
|
|
|
configuration conf; |
16302
|
|
|
|
|
|
|
vector heads; |
16303
|
|
|
|
|
|
|
vector deprels; |
16304
|
|
|
|
|
|
|
double cost; |
16305
|
|
|
|
|
|
|
|
16306
|
|
|
|
|
|
|
void refresh_tree(); |
16307
|
|
|
|
|
|
|
void save_tree(); |
16308
|
|
|
|
|
|
|
}; |
16309
|
|
|
|
|
|
|
struct beam_size_alternative { |
16310
|
|
|
|
|
|
|
const beam_size_configuration* bs_conf; |
16311
|
|
|
|
|
|
|
int transition; |
16312
|
|
|
|
|
|
|
double cost; |
16313
|
|
|
|
|
|
|
bool operator<(const beam_size_alternative& other) const { return cost > other.cost; } |
16314
|
|
|
|
|
|
|
|
16315
|
|
|
|
|
|
|
beam_size_alternative(const beam_size_configuration* bs_conf, int transition, double cost) |
16316
|
241
|
|
|
|
|
|
: bs_conf(bs_conf), transition(transition), cost(cost) {} |
16317
|
|
|
|
|
|
|
}; |
16318
|
|
|
|
|
|
|
vector bs_confs[2]; size_t bs_confs_size[2]; |
16319
|
|
|
|
|
|
|
vector bs_alternatives; |
16320
|
|
|
|
|
|
|
}; |
16321
|
|
|
|
|
|
|
mutable threadsafe_stack workspaces; |
16322
|
|
|
|
|
|
|
}; |
16323
|
|
|
|
|
|
|
|
16324
|
|
|
|
|
|
|
} // namespace parsito |
16325
|
|
|
|
|
|
|
|
16326
|
|
|
|
|
|
|
///////// |
16327
|
|
|
|
|
|
|
// File: parsito/parser/parser.cpp |
16328
|
|
|
|
|
|
|
///////// |
16329
|
|
|
|
|
|
|
|
16330
|
|
|
|
|
|
|
// This file is part of Parsito . |
16331
|
|
|
|
|
|
|
// |
16332
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16333
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16334
|
|
|
|
|
|
|
// |
16335
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16336
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16337
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16338
|
|
|
|
|
|
|
|
16339
|
|
|
|
|
|
|
namespace parsito { |
16340
|
|
|
|
|
|
|
|
16341
|
0
|
|
|
|
|
|
// Load a parser from the file with the given UTF-8 path. Returns nullptr when
// the file cannot be opened; otherwise returns whatever the stream overload
// returns.
parser* parser::load(const char* file, unsigned cache) {
  ifstream in(path_from_utf8(file).c_str(), ifstream::in | ifstream::binary);
  return in.is_open() ? load(in, cache) : nullptr;
}
16346
|
|
|
|
|
|
|
|
16347
|
1
|
|
|
|
|
|
// Load a parser from a stream. The stream is decompressed into a binary
// decoder, the parser name is read, the matching parser is created and asked
// to load itself. Returns nullptr when decompression fails, the name is
// unknown, decoding throws binary_decoder_error, or data remain after
// loading. On success the caller takes ownership of the returned pointer.
//
// NOTE(review): the template argument of unique_ptr was missing from the
// reviewed text; unique_ptr<parser> follows from reset(create(name)) and the
// return type.
parser* parser::load(istream& in, unsigned cache) {
  unique_ptr<parser> result;

  binary_decoder data;
  if (!compressor::load(in, data)) return nullptr;

  try {
    string name;
    data.next_str(name);

    result.reset(create(name));
    if (!result) return nullptr;

    result->load(data, cache);
  } catch (const binary_decoder_error&) {
    // result is destroyed on return, so a partially loaded parser is dropped.
    return nullptr;
  }

  return result && data.is_end() ? result.release() : nullptr;
}
16367
|
|
|
|
|
|
|
|
16368
|
1
|
|
|
|
|
|
// Create an empty parser by name: "nn" yields an unversioned parser_nn,
// "nn_versioned" a versioned one, and any other name yields nullptr.
// The caller owns the returned object.
parser* parser::create(const string& name) {
  const bool unversioned = name == "nn";
  if (unversioned || name == "nn_versioned") return new parser_nn(!unversioned);
  return nullptr;
}
16373
|
|
|
|
|
|
|
|
16374
|
|
|
|
|
|
|
} // namespace parsito |
16375
|
|
|
|
|
|
|
|
16376
|
|
|
|
|
|
|
///////// |
16377
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn.cpp |
16378
|
|
|
|
|
|
|
///////// |
16379
|
|
|
|
|
|
|
|
16380
|
|
|
|
|
|
|
// This file is part of Parsito . |
16381
|
|
|
|
|
|
|
// |
16382
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16383
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16384
|
|
|
|
|
|
|
// |
16385
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16386
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16387
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16388
|
|
|
|
|
|
|
|
16389
|
|
|
|
|
|
|
namespace parsito { |
16390
|
|
|
|
|
|
|
|
16391
|
|
|
|
|
|
|
// Versions: |
16392
|
|
|
|
|
|
|
// 1: initial version |
16393
|
|
|
|
|
|
|
// 2: add ReLU activation function |
16394
|
|
|
|
|
|
|
|
16395
|
1
|
|
|
|
|
|
parser_nn::parser_nn(bool versioned) : versioned(versioned) {} |
16396
|
|
|
|
|
|
|
|
16397
|
1
|
|
|
|
|
|
void parser_nn::parse(tree& t, unsigned beam_size, double* cost) const { |
16398
|
1
|
50
|
|
|
|
|
if (beam_size > 1) |
16399
|
1
|
|
|
|
|
|
parse_beam_search(t, beam_size, cost); |
16400
|
|
|
|
|
|
|
else |
16401
|
0
|
0
|
|
|
|
|
parse_greedy(t, cost); |
16402
|
1
|
|
|
|
|
|
} |
16403
|
|
|
|
|
|
|
|
16404
|
0
|
|
|
|
|
|
void parser_nn::parse_greedy(tree& t, double* cost) const { |
16405
|
0
|
0
|
|
|
|
|
assert(system); |
16406
|
0
|
0
|
|
|
|
|
if (cost) *cost = 0.; |
16407
|
|
|
|
|
|
|
|
16408
|
|
|
|
|
|
|
// Retrieve or create workspace |
16409
|
0
|
|
|
|
|
|
workspace* w = workspaces.pop(); |
16410
|
0
|
0
|
|
|
|
|
if (!w) w = new workspace(single_root); |
16411
|
|
|
|
|
|
|
|
16412
|
|
|
|
|
|
|
// Create configuration |
16413
|
0
|
|
|
|
|
|
w->conf.init(&t); |
16414
|
|
|
|
|
|
|
|
16415
|
|
|
|
|
|
|
// Compute embeddings of all nodes |
16416
|
0
|
0
|
|
|
|
|
if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size()); |
16417
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < t.nodes.size(); i++) { |
16418
|
0
|
0
|
|
|
|
|
if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size()); |
16419
|
0
|
0
|
|
|
|
|
for (size_t j = 0; j < embeddings.size(); j++) { |
16420
|
0
|
|
|
|
|
|
values[j].extract(t.nodes[i], w->word); |
16421
|
0
|
|
|
|
|
|
w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer); |
16422
|
|
|
|
|
|
|
} |
16423
|
|
|
|
|
|
|
} |
16424
|
|
|
|
|
|
|
|
16425
|
|
|
|
|
|
|
// Compute which transitions to perform and perform them |
16426
|
|
|
|
|
|
|
int transitions = 0; |
16427
|
0
|
0
|
|
|
|
|
for (; !w->conf.final(); transitions++) { |
16428
|
|
|
|
|
|
|
// Extract nodes from the configuration |
16429
|
0
|
|
|
|
|
|
nodes.extract(w->conf, w->extracted_nodes); |
16430
|
0
|
|
|
|
|
|
w->extracted_embeddings.resize(w->extracted_nodes.size()); |
16431
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < w->extracted_nodes.size(); i++) |
16432
|
0
|
0
|
|
|
|
|
w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr; |
16433
|
|
|
|
|
|
|
|
16434
|
|
|
|
|
|
|
// Classify using neural network |
16435
|
0
|
|
|
|
|
|
network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache, cost ? true : false); |
16436
|
|
|
|
|
|
|
|
16437
|
|
|
|
|
|
|
// Find most probable applicable transition |
16438
|
|
|
|
|
|
|
int best = -1; |
16439
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < w->outcomes.size(); i++) |
16440
|
0
|
0
|
|
|
|
|
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16441
|
0
|
|
|
|
|
|
best = i; |
16442
|
|
|
|
|
|
|
|
16443
|
|
|
|
|
|
|
// Perform the best transition |
16444
|
0
|
|
|
|
|
|
int child = system->perform(w->conf, best); |
16445
|
0
|
0
|
|
|
|
|
if (cost) *cost += log(w->outcomes[best]); |
16446
|
|
|
|
|
|
|
|
16447
|
|
|
|
|
|
|
// If a node was linked, recompute its embeddings as deprel has changed |
16448
|
0
|
0
|
|
|
|
|
if (child >= 0) |
16449
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < embeddings.size(); i++) { |
16450
|
0
|
|
|
|
|
|
values[i].extract(t.nodes[child], w->word); |
16451
|
0
|
|
|
|
|
|
w->embeddings[child][i] = embeddings[i].lookup_word(w->word, w->word_buffer); |
16452
|
|
|
|
|
|
|
} |
16453
|
|
|
|
|
|
|
} |
16454
|
|
|
|
|
|
|
|
16455
|
0
|
0
|
|
|
|
|
if (cost && transitions) |
16456
|
0
|
|
|
|
|
|
*cost = *cost / transitions * (t.nodes.size() - 1); |
16457
|
|
|
|
|
|
|
|
16458
|
|
|
|
|
|
|
// Store workspace |
16459
|
0
|
|
|
|
|
|
workspaces.push(w); |
16460
|
0
|
|
|
|
|
|
} |
16461
|
|
|
|
|
|
|
|
16462
|
1
|
|
|
|
|
|
void parser_nn::parse_beam_search(tree& t, unsigned beam_size, double* cost) const { |
16463
|
1
|
50
|
|
|
|
|
assert(system); |
16464
|
|
|
|
|
|
|
|
16465
|
|
|
|
|
|
|
// Retrieve or create workspace |
16466
|
1
|
|
|
|
|
|
workspace* w = workspaces.pop(); |
16467
|
1
|
50
|
|
|
|
|
if (!w) w = new workspace(single_root); |
16468
|
|
|
|
|
|
|
|
16469
|
|
|
|
|
|
|
// Allocate and initialize configuration |
16470
|
3
|
100
|
|
|
|
|
for (int i = 0; i < 2; i++) { |
16471
|
12
|
100
|
|
|
|
|
while (w->bs_confs[i].size() < beam_size) w->bs_confs[i].emplace_back(single_root); |
16472
|
2
|
50
|
|
|
|
|
while (w->bs_confs[i].size() > beam_size) w->bs_confs[i].pop_back(); |
16473
|
2
|
|
|
|
|
|
w->bs_confs_size[i] = 0; |
16474
|
|
|
|
|
|
|
} |
16475
|
1
|
|
|
|
|
|
w->bs_confs[0][0].cost = 0; |
16476
|
1
|
|
|
|
|
|
w->bs_confs[0][0].conf.init(&t); |
16477
|
1
|
|
|
|
|
|
w->bs_confs[0][0].save_tree(); |
16478
|
1
|
|
|
|
|
|
w->bs_confs_size[0] = 1; |
16479
|
|
|
|
|
|
|
|
16480
|
|
|
|
|
|
|
// Compute embeddings of all nodes |
16481
|
1
|
50
|
|
|
|
|
if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size()); |
16482
|
1
|
50
|
|
|
|
|
if (w->embeddings_values.size() < t.nodes.size()) w->embeddings_values.resize(t.nodes.size()); |
16483
|
9
|
100
|
|
|
|
|
for (size_t i = 0; i < t.nodes.size(); i++) { |
16484
|
8
|
50
|
|
|
|
|
if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size()); |
16485
|
8
|
50
|
|
|
|
|
if (w->embeddings_values[i].size() < embeddings.size()) w->embeddings_values[i].resize(embeddings.size()); |
16486
|
40
|
100
|
|
|
|
|
for (size_t j = 0; j < embeddings.size(); j++) { |
16487
|
32
|
|
|
|
|
|
values[j].extract(t.nodes[i], w->embeddings_values[i][j]); |
16488
|
32
|
|
|
|
|
|
w->embeddings[i][j] = embeddings[j].lookup_word(w->embeddings_values[i][j], w->word_buffer); |
16489
|
|
|
|
|
|
|
} |
16490
|
|
|
|
|
|
|
} |
16491
|
|
|
|
|
|
|
|
16492
|
|
|
|
|
|
|
// Compute which transitions to perform and perform them |
16493
|
|
|
|
|
|
|
size_t iteration = 0; |
16494
|
16
|
100
|
|
|
|
|
for (bool all_final = false; !all_final; iteration++) { |
16495
|
|
|
|
|
|
|
all_final = true; |
16496
|
|
|
|
|
|
|
w->bs_alternatives.clear(); |
16497
|
|
|
|
|
|
|
|
16498
|
82
|
100
|
|
|
|
|
for (size_t c = 0; c < w->bs_confs_size[iteration & 1]; c++) { |
16499
|
67
|
|
|
|
|
|
auto& bs_conf = w->bs_confs[iteration & 1][c]; |
16500
|
|
|
|
|
|
|
|
16501
|
67
|
100
|
|
|
|
|
if (bs_conf.conf.final()) { |
16502
|
5
|
50
|
|
|
|
|
if (w->bs_alternatives.size() == beam_size) { |
16503
|
0
|
0
|
|
|
|
|
if (bs_conf.cost <= w->bs_alternatives[0].cost) continue; |
16504
|
|
|
|
|
|
|
pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end()); |
16505
|
|
|
|
|
|
|
w->bs_alternatives.pop_back(); |
16506
|
|
|
|
|
|
|
} |
16507
|
5
|
|
|
|
|
|
w->bs_alternatives.emplace_back(&bs_conf, -1, bs_conf.cost); |
16508
|
5
|
|
|
|
|
|
push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end()); |
16509
|
5
|
|
|
|
|
|
continue; |
16510
|
|
|
|
|
|
|
} |
16511
|
|
|
|
|
|
|
all_final = false; |
16512
|
|
|
|
|
|
|
|
16513
|
62
|
|
|
|
|
|
bs_conf.refresh_tree(); |
16514
|
|
|
|
|
|
|
// Update embeddings for all nodes |
16515
|
558
|
100
|
|
|
|
|
for (size_t i = 0; i < t.nodes.size(); i++) |
16516
|
2480
|
100
|
|
|
|
|
for (size_t j = 0; j < embeddings.size(); j++) { |
16517
|
1984
|
|
|
|
|
|
values[j].extract(t.nodes[i], w->word); |
16518
|
1984
|
100
|
|
|
|
|
if (w->word != w->embeddings_values[i][j]) { |
16519
|
96
|
|
|
|
|
|
w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer); |
16520
|
|
|
|
|
|
|
w->embeddings_values[i][j].assign(w->word); |
16521
|
|
|
|
|
|
|
} |
16522
|
|
|
|
|
|
|
} |
16523
|
|
|
|
|
|
|
|
16524
|
|
|
|
|
|
|
// Extract nodes from the configuration |
16525
|
62
|
|
|
|
|
|
nodes.extract(bs_conf.conf, w->extracted_nodes); |
16526
|
62
|
|
|
|
|
|
w->extracted_embeddings.resize(w->extracted_nodes.size()); |
16527
|
1178
|
100
|
|
|
|
|
for (size_t i = 0; i < w->extracted_nodes.size(); i++) |
16528
|
1116
|
100
|
|
|
|
|
w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr; |
16529
|
|
|
|
|
|
|
|
16530
|
|
|
|
|
|
|
// Classify using neural network |
16531
|
62
|
|
|
|
|
|
network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache); |
16532
|
|
|
|
|
|
|
|
16533
|
|
|
|
|
|
|
// Store all alternatives |
16534
|
868
|
100
|
|
|
|
|
for (unsigned i = 0; i < w->outcomes.size(); i++) |
16535
|
806
|
100
|
|
|
|
|
if (system->applicable(bs_conf.conf, i)) { |
16536
|
1899
|
|
|
|
|
|
double cost = (bs_conf.cost * iteration + log(w->outcomes[i])) / (iteration + 1); |
16537
|
633
|
100
|
|
|
|
|
if (w->bs_alternatives.size() == beam_size) { |
16538
|
567
|
100
|
|
|
|
|
if (cost <= w->bs_alternatives[0].cost) continue; |
16539
|
|
|
|
|
|
|
pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end()); |
16540
|
|
|
|
|
|
|
w->bs_alternatives.pop_back(); |
16541
|
|
|
|
|
|
|
} |
16542
|
236
|
|
|
|
|
|
w->bs_alternatives.emplace_back(&bs_conf, i, cost); |
16543
|
236
|
|
|
|
|
|
push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end()); |
16544
|
|
|
|
|
|
|
} |
16545
|
|
|
|
|
|
|
} |
16546
|
|
|
|
|
|
|
|
16547
|
15
|
|
|
|
|
|
w->bs_confs_size[(iteration + 1) & 1] = 0; |
16548
|
86
|
100
|
|
|
|
|
for (auto&& alternative : w->bs_alternatives) { |
16549
|
71
|
|
|
|
|
|
auto& bs_conf_new = w->bs_confs[(iteration + 1) & 1][w->bs_confs_size[(iteration + 1) & 1]++]; |
16550
|
71
|
|
|
|
|
|
bs_conf_new = *alternative.bs_conf; |
16551
|
71
|
|
|
|
|
|
bs_conf_new.cost = alternative.cost; |
16552
|
71
|
100
|
|
|
|
|
if (alternative.transition >= 0) { |
16553
|
66
|
|
|
|
|
|
bs_conf_new.refresh_tree(); |
16554
|
66
|
|
|
|
|
|
system->perform(bs_conf_new.conf, alternative.transition); |
16555
|
66
|
|
|
|
|
|
bs_conf_new.save_tree(); |
16556
|
|
|
|
|
|
|
} |
16557
|
|
|
|
|
|
|
} |
16558
|
|
|
|
|
|
|
} |
16559
|
|
|
|
|
|
|
|
16560
|
|
|
|
|
|
|
// Return the best tree |
16561
|
|
|
|
|
|
|
size_t best = 0; |
16562
|
5
|
100
|
|
|
|
|
for (size_t i = 1; i < w->bs_confs_size[iteration & 1]; i++) |
16563
|
4
|
100
|
|
|
|
|
if (w->bs_confs[iteration & 1][i].cost > w->bs_confs[iteration & 1][best].cost) |
16564
|
|
|
|
|
|
|
best = i; |
16565
|
1
|
|
|
|
|
|
w->bs_confs[iteration & 1][best].refresh_tree(); |
16566
|
|
|
|
|
|
|
|
16567
|
1
|
50
|
|
|
|
|
if (cost) *cost = w->bs_confs[iteration & 1][best].cost * (t.nodes.size() - 1); |
16568
|
|
|
|
|
|
|
|
16569
|
|
|
|
|
|
|
// Store workspace |
16570
|
1
|
|
|
|
|
|
workspaces.push(w); |
16571
|
1
|
|
|
|
|
|
} |
16572
|
|
|
|
|
|
|
|
16573
|
129
|
|
|
|
|
|
void parser_nn::workspace::beam_size_configuration::refresh_tree() { |
16574
|
1161
|
100
|
|
|
|
|
for (auto&& node : conf.t->nodes) node.children.clear(); |
16575
|
1161
|
100
|
|
|
|
|
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
16576
|
1032
|
|
|
|
|
|
conf.t->nodes[i].head = heads[i]; |
16577
|
2064
|
|
|
|
|
|
conf.t->nodes[i].deprel = deprels[i]; |
16578
|
1334
|
100
|
|
|
|
|
if (heads[i] >= 0) conf.t->nodes[heads[i]].children.push_back(i); |
16579
|
|
|
|
|
|
|
} |
16580
|
129
|
|
|
|
|
|
} |
16581
|
|
|
|
|
|
|
|
16582
|
67
|
|
|
|
|
|
void parser_nn::workspace::beam_size_configuration::save_tree() { |
16583
|
67
|
100
|
|
|
|
|
if (conf.t->nodes.size() > heads.size()) heads.resize(conf.t->nodes.size()); |
16584
|
67
|
100
|
|
|
|
|
if (conf.t->nodes.size() > deprels.size()) deprels.resize(conf.t->nodes.size()); |
16585
|
603
|
100
|
|
|
|
|
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
16586
|
536
|
|
|
|
|
|
heads[i] = conf.t->nodes[i].head; |
16587
|
1072
|
|
|
|
|
|
deprels[i] = conf.t->nodes[i].deprel; |
16588
|
|
|
|
|
|
|
} |
16589
|
67
|
|
|
|
|
|
} |
16590
|
|
|
|
|
|
|
|
16591
|
1
|
|
|
|
|
|
void parser_nn::load(binary_decoder& data, unsigned cache) { |
16592
|
|
|
|
|
|
|
string description, error; |
16593
|
|
|
|
|
|
|
|
16594
|
1
|
50
|
|
|
|
|
version = versioned ? data.next_1B() : 1; |
|
|
0
|
|
|
|
|
|
16595
|
1
|
50
|
|
|
|
|
if (!(version >= 1 && version <= VERSION_LATEST)) |
16596
|
0
|
|
|
|
|
|
throw binary_decoder_error("Unrecognized version of the parser_nn model"); |
16597
|
|
|
|
|
|
|
|
16598
|
1
|
50
|
|
|
|
|
single_root = version >= 2 ? data.next_1B() : false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16599
|
|
|
|
|
|
|
|
16600
|
|
|
|
|
|
|
// Load labels |
16601
|
1
|
50
|
|
|
|
|
labels.resize(data.next_2B()); |
|
|
50
|
|
|
|
|
|
16602
|
7
|
100
|
|
|
|
|
for (auto&& label : labels) |
16603
|
6
|
50
|
|
|
|
|
data.next_str(label); |
16604
|
|
|
|
|
|
|
|
16605
|
|
|
|
|
|
|
// Load transition system |
16606
|
|
|
|
|
|
|
string system_name; |
16607
|
1
|
50
|
|
|
|
|
data.next_str(system_name); |
16608
|
1
|
50
|
|
|
|
|
system.reset(transition_system::create(system_name, labels)); |
16609
|
1
|
50
|
|
|
|
|
if (!system) throw binary_decoder_error("Cannot load transition system"); |
16610
|
|
|
|
|
|
|
|
16611
|
|
|
|
|
|
|
// Load node extractor |
16612
|
1
|
50
|
|
|
|
|
data.next_str(description); |
16613
|
1
|
50
|
|
|
|
|
if (!nodes.create(description, error)) |
|
|
50
|
|
|
|
|
|
16614
|
0
|
|
|
|
|
|
throw binary_decoder_error(error.c_str()); |
16615
|
|
|
|
|
|
|
|
16616
|
|
|
|
|
|
|
// Load value extractors and embeddings |
16617
|
1
|
50
|
|
|
|
|
values.resize(data.next_2B()); |
|
|
50
|
|
|
|
|
|
16618
|
5
|
100
|
|
|
|
|
for (auto&& value : values) { |
16619
|
4
|
50
|
|
|
|
|
data.next_str(description); |
16620
|
4
|
50
|
|
|
|
|
if (!value.create(description, error)) |
|
|
50
|
|
|
|
|
|
16621
|
0
|
|
|
|
|
|
throw binary_decoder_error(error.c_str()); |
16622
|
|
|
|
|
|
|
} |
16623
|
|
|
|
|
|
|
|
16624
|
1
|
50
|
|
|
|
|
embeddings.resize(values.size()); |
16625
|
5
|
100
|
|
|
|
|
for (auto&& embedding : embeddings) |
16626
|
4
|
50
|
|
|
|
|
embedding.load(data); |
16627
|
|
|
|
|
|
|
|
16628
|
|
|
|
|
|
|
// Load the network |
16629
|
1
|
50
|
|
|
|
|
network.load(data); |
16630
|
1
|
50
|
|
|
|
|
network.generate_tanh_cache(); |
16631
|
1
|
50
|
|
|
|
|
network.generate_embeddings_cache(embeddings, embeddings_cache, cache); |
16632
|
1
|
|
|
|
|
|
} |
16633
|
|
|
|
|
|
|
|
16634
|
|
|
|
|
|
|
} // namespace parsito |
16635
|
|
|
|
|
|
|
|
16636
|
|
|
|
|
|
|
///////// |
16637
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn_trainer.h |
16638
|
|
|
|
|
|
|
///////// |
16639
|
|
|
|
|
|
|
|
16640
|
|
|
|
|
|
|
// This file is part of Parsito . |
16641
|
|
|
|
|
|
|
// |
16642
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16643
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16644
|
|
|
|
|
|
|
// |
16645
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16646
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16647
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16648
|
|
|
|
|
|
|
|
16649
|
|
|
|
|
|
|
namespace parsito { |
16650
|
|
|
|
|
|
|
|
16651
|
|
|
|
|
|
|
class parser_nn_trainer { |
16652
|
|
|
|
|
|
|
public: |
16653
|
|
|
|
|
|
|
static void train(const string& transition_system_name, const string& transition_oracle_name, bool single_root, |
16654
|
|
|
|
|
|
|
const string& embeddings_description, const string& nodes_description, const network_parameters& parameters, |
16655
|
|
|
|
|
|
|
unsigned number_of_threads, const vector& train, const vector& heldout, binary_encoder& enc); |
16656
|
|
|
|
|
|
|
}; |
16657
|
|
|
|
|
|
|
|
16658
|
|
|
|
|
|
|
} // namespace parsito |
16659
|
|
|
|
|
|
|
|
16660
|
|
|
|
|
|
|
///////// |
16661
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn_trainer.cpp |
16662
|
|
|
|
|
|
|
///////// |
16663
|
|
|
|
|
|
|
|
16664
|
|
|
|
|
|
|
// This file is part of Parsito . |
16665
|
|
|
|
|
|
|
// |
16666
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
16667
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
16668
|
|
|
|
|
|
|
// |
16669
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
16670
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
16671
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
16672
|
|
|
|
|
|
|
|
16673
|
|
|
|
|
|
|
namespace parsito { |
16674
|
|
|
|
|
|
|
|
16675
|
0
|
|
|
|
|
|
void parser_nn_trainer::train(const string& transition_system_name, const string& transition_oracle_name, bool single_root, |
16676
|
|
|
|
|
|
|
const string& embeddings_description, const string& nodes_description, const network_parameters& parameters, |
16677
|
|
|
|
|
|
|
unsigned /*number_of_threads*/, const vector& train, const vector& heldout, binary_encoder& enc) { |
16678
|
0
|
0
|
|
|
|
|
if (train.empty()) training_failure("No training data was given!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16679
|
|
|
|
|
|
|
|
16680
|
|
|
|
|
|
|
// Random generator with fixed seed for reproducibility |
16681
|
|
|
|
|
|
|
mt19937 generator(42); |
16682
|
|
|
|
|
|
|
|
16683
|
|
|
|
|
|
|
// Check that all non-root nodes have heads and nonempty deprel |
16684
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
16685
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
16686
|
0
|
0
|
|
|
|
|
if (node.id) { |
16687
|
0
|
0
|
|
|
|
|
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16688
|
0
|
0
|
|
|
|
|
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16689
|
|
|
|
|
|
|
} |
16690
|
|
|
|
|
|
|
|
16691
|
|
|
|
|
|
|
// Create parser instance to be trained |
16692
|
0
|
|
|
|
|
|
parser_nn parser(true); parser.version = parser_nn::VERSION_LATEST; |
16693
|
|
|
|
|
|
|
|
16694
|
|
|
|
|
|
|
// Generate labels for transition system |
16695
|
|
|
|
|
|
|
unordered_set labels_set; |
16696
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
16697
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
16698
|
0
|
0
|
|
|
|
|
if (node.id && !labels_set.count(node.deprel)) { |
16699
|
0
|
|
|
|
|
|
labels_set.insert(node.deprel); |
16700
|
0
|
0
|
|
|
|
|
parser.labels.push_back(node.deprel); |
16701
|
|
|
|
|
|
|
} |
16702
|
|
|
|
|
|
|
|
16703
|
|
|
|
|
|
|
// If single_root, check that exactly root nodes have "root" deprel |
16704
|
0
|
0
|
|
|
|
|
if (single_root) { |
16705
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) { |
16706
|
|
|
|
|
|
|
unsigned roots = 0; |
16707
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
16708
|
0
|
0
|
|
|
|
|
if (node.id) { |
16709
|
0
|
0
|
|
|
|
|
if (node.head == 0 && node.deprel != "root") |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16710
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, every root node must have 'root' deprel!"); |
|
|
0
|
|
|
|
|
|
16711
|
0
|
0
|
|
|
|
|
if (node.head != 0 && node.deprel == "root") |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16712
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, any non-root cannot have 'root' deprel!"); |
|
|
0
|
|
|
|
|
|
16713
|
0
|
|
|
|
|
|
roots += node.head == 0; |
16714
|
|
|
|
|
|
|
} |
16715
|
0
|
0
|
|
|
|
|
if (roots != 1) |
16716
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, every training tree must have single root!"); |
|
|
0
|
|
|
|
|
|
16717
|
|
|
|
|
|
|
} |
16718
|
|
|
|
|
|
|
|
16719
|
|
|
|
|
|
|
// Make sure (in case input is really small) there is "root" deprel plus another one |
16720
|
0
|
0
|
|
|
|
|
if (!labels_set.count("root")) |
|
|
0
|
|
|
|
|
|
16721
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, the deprel 'root' must be present!"); |
|
|
0
|
|
|
|
|
|
16722
|
0
|
0
|
|
|
|
|
if (labels_set.size() <= 1) |
16723
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, deprel different from 'root' must exist!"); |
|
|
0
|
|
|
|
|
|
16724
|
|
|
|
|
|
|
} |
16725
|
|
|
|
|
|
|
|
16726
|
|
|
|
|
|
|
// Create transition system and transition oracle |
16727
|
0
|
0
|
|
|
|
|
parser.system.reset(transition_system::create(transition_system_name, parser.labels)); |
16728
|
0
|
0
|
|
|
|
|
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16729
|
|
|
|
|
|
|
|
16730
|
0
|
0
|
|
|
|
|
unique_ptr oracle(parser.system->oracle(transition_oracle_name)); |
16731
|
0
|
0
|
|
|
|
|
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16732
|
|
|
|
|
|
|
|
16733
|
|
|
|
|
|
|
// Create node_extractor |
16734
|
|
|
|
|
|
|
string error; |
16735
|
0
|
0
|
|
|
|
|
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16736
|
|
|
|
|
|
|
|
16737
|
|
|
|
|
|
|
// Load value_extractors and embeddings |
16738
|
0
|
|
|
|
|
|
vector value_names; |
16739
|
|
|
|
|
|
|
vector lines, tokens; |
16740
|
0
|
0
|
|
|
|
|
split(embeddings_description, '\n', lines); |
16741
|
0
|
0
|
|
|
|
|
for (auto&& line : lines) { |
16742
|
|
|
|
|
|
|
// Ignore empty lines and comments |
16743
|
0
|
0
|
|
|
|
|
if (!line.len || line.str[0] == '#') continue; |
|
|
0
|
|
|
|
|
|
16744
|
|
|
|
|
|
|
|
16745
|
0
|
0
|
|
|
|
|
split(line, ' ', tokens); |
16746
|
0
|
0
|
|
|
|
|
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16747
|
0
|
0
|
|
|
|
|
training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!"); |
|
|
0
|
|
|
|
|
|
16748
|
|
|
|
|
|
|
|
16749
|
0
|
0
|
|
|
|
|
value_names.emplace_back(string(tokens[0].str, tokens[0].len)); |
16750
|
0
|
0
|
|
|
|
|
parser.values.emplace_back(); |
16751
|
0
|
0
|
|
|
|
|
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16752
|
|
|
|
|
|
|
|
16753
|
0
|
0
|
|
|
|
|
int dimension = parse_int(tokens[1], "embedding dimension"); |
16754
|
0
|
0
|
|
|
|
|
int min_count = parse_int(tokens[2], "minimum frequency count"); |
16755
|
|
|
|
|
|
|
unsigned updatable_index = 0; |
16756
|
|
|
|
|
|
|
unsigned embeddings_from_file = 0; |
16757
|
|
|
|
|
|
|
string embeddings_from_file_comment; |
16758
|
0
|
|
|
|
|
|
vector>> weights; |
16759
|
|
|
|
|
|
|
unordered_set weights_set; |
16760
|
|
|
|
|
|
|
|
16761
|
|
|
|
|
|
|
// Compute words and counts present in the training data |
16762
|
|
|
|
|
|
|
string word; |
16763
|
|
|
|
|
|
|
unordered_map word_counts; |
16764
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
16765
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
16766
|
0
|
0
|
|
|
|
|
if (node.id) { |
16767
|
0
|
0
|
|
|
|
|
parser.values.back().extract(node, word); |
16768
|
0
|
|
|
|
|
|
word_counts[word]++; |
16769
|
|
|
|
|
|
|
} |
16770
|
|
|
|
|
|
|
|
16771
|
|
|
|
|
|
|
// Load embedding if it was given |
16772
|
0
|
0
|
|
|
|
|
if (tokens.size() >= 4) { |
16773
|
0
|
0
|
|
|
|
|
int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1; |
|
|
0
|
|
|
|
|
|
16774
|
0
|
0
|
|
|
|
|
int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max(); |
|
|
0
|
|
|
|
|
|
16775
|
0
|
0
|
|
|
|
|
ifstream in(path_from_utf8(string(tokens[3].str, tokens[3].len)).c_str()); |
16776
|
0
|
0
|
|
|
|
|
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16777
|
|
|
|
|
|
|
|
16778
|
|
|
|
|
|
|
// Load first line containing dictionary size and dimensions |
16779
|
|
|
|
|
|
|
string line; |
16780
|
|
|
|
|
|
|
vector parts; |
16781
|
0
|
0
|
|
|
|
|
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16782
|
0
|
0
|
|
|
|
|
split(line, ' ', parts); |
16783
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16784
|
0
|
0
|
|
|
|
|
int file_dimension = parse_int(parts[1], "embedding file dimension"); |
16785
|
|
|
|
|
|
|
|
16786
|
0
|
0
|
|
|
|
|
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16787
|
|
|
|
|
|
|
|
16788
|
|
|
|
|
|
|
// Generate random projection when smaller dimension is required |
16789
|
0
|
|
|
|
|
|
vector> projection; |
16790
|
0
|
0
|
|
|
|
|
if (file_dimension > dimension) { |
16791
|
0
|
0
|
|
|
|
|
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16792
|
|
|
|
|
|
|
|
16793
|
|
|
|
|
|
|
uniform_real_distribution uniform(0, 1); |
16794
|
0
|
0
|
|
|
|
|
projection.resize(dimension); |
16795
|
0
|
0
|
|
|
|
|
for (auto&& row : projection) { |
16796
|
0
|
0
|
|
|
|
|
row.resize(file_dimension); |
16797
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) weight = uniform(generator); |
16798
|
|
|
|
|
|
|
|
16799
|
|
|
|
|
|
|
double sum = 0; |
16800
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) sum += weight; |
16801
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) weight /= sum; |
16802
|
|
|
|
|
|
|
} |
16803
|
|
|
|
|
|
|
} |
16804
|
|
|
|
|
|
|
|
16805
|
|
|
|
|
|
|
// Load input embedding |
16806
|
0
|
0
|
|
|
|
|
vector input_weights(file_dimension); |
16807
|
0
|
0
|
|
|
|
|
vector projected_weights(dimension); |
16808
|
0
|
0
|
|
|
|
|
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16809
|
0
|
0
|
|
|
|
|
split(line, ' ', parts); |
16810
|
0
|
0
|
|
|
|
|
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16811
|
0
|
0
|
|
|
|
|
if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]); |
|
|
0
|
|
|
|
|
|
16812
|
0
|
0
|
|
|
|
|
for (int i = 0; i < file_dimension; i++) |
16813
|
0
|
0
|
|
|
|
|
input_weights[i] = parse_double(parts[1 + i], "embedding weight"); |
16814
|
|
|
|
|
|
|
|
16815
|
0
|
|
|
|
|
|
string word(parts[0].str, parts[0].len); |
16816
|
|
|
|
|
|
|
|
16817
|
|
|
|
|
|
|
// For update_weights == 2, ignore embeddings for unknown words |
16818
|
0
|
0
|
|
|
|
|
if (update_weights == 2 && !word_counts.count(word)) |
16819
|
|
|
|
|
|
|
continue; |
16820
|
|
|
|
|
|
|
|
16821
|
0
|
0
|
|
|
|
|
for (int i = 0; i < dimension; i++) |
16822
|
0
|
0
|
|
|
|
|
if (file_dimension == dimension) { |
16823
|
0
|
|
|
|
|
|
projected_weights[i] = input_weights[i]; |
16824
|
|
|
|
|
|
|
} else { |
16825
|
0
|
|
|
|
|
|
projected_weights[i] = 0; |
16826
|
0
|
0
|
|
|
|
|
for (int j = 0; j < file_dimension; j++) |
16827
|
0
|
|
|
|
|
|
projected_weights[i] += projection[i][j] * input_weights[j]; |
16828
|
|
|
|
|
|
|
} |
16829
|
|
|
|
|
|
|
|
16830
|
0
|
0
|
|
|
|
|
if (!weights_set.count(word)) { |
16831
|
0
|
0
|
|
|
|
|
weights.emplace_back(word, projected_weights); |
16832
|
|
|
|
|
|
|
weights_set.insert(word); |
16833
|
|
|
|
|
|
|
} |
16834
|
|
|
|
|
|
|
} |
16835
|
0
|
|
|
|
|
|
embeddings_from_file = weights.size(); |
16836
|
0
|
0
|
|
|
|
|
updatable_index = update_weights ? 0 : embeddings_from_file; |
16837
|
|
|
|
|
|
|
} |
16838
|
|
|
|
|
|
|
|
16839
|
|
|
|
|
|
|
// Add embedding for non-present word with min_count, sorted by count |
16840
|
|
|
|
|
|
|
{ |
16841
|
0
|
|
|
|
|
|
vector> count_words; |
16842
|
0
|
0
|
|
|
|
|
for (auto&& word_count : word_counts) |
16843
|
0
|
0
|
|
|
|
|
if (word_count.second >= min_count && !weights_set.count(word_count.first)) |
16844
|
0
|
0
|
|
|
|
|
count_words.emplace_back(word_count.second, word_count.first); |
16845
|
|
|
|
|
|
|
|
16846
|
|
|
|
|
|
|
sort(count_words.rbegin(), count_words.rend()); |
16847
|
|
|
|
|
|
|
|
16848
|
0
|
0
|
|
|
|
|
vector word_weights(dimension); |
16849
|
|
|
|
|
|
|
uniform_real_distribution uniform(-1, 1); |
16850
|
0
|
0
|
|
|
|
|
for (auto&& count_word : count_words) { |
16851
|
0
|
0
|
|
|
|
|
for (auto&& word_weight : word_weights) |
16852
|
0
|
|
|
|
|
|
word_weight = uniform(generator); |
16853
|
|
|
|
|
|
|
|
16854
|
0
|
0
|
|
|
|
|
weights.emplace_back(count_word.second, word_weights); |
16855
|
|
|
|
|
|
|
} |
16856
|
|
|
|
|
|
|
} |
16857
|
|
|
|
|
|
|
|
16858
|
|
|
|
|
|
|
// If there are unknown words in the training data, create initial embedding |
16859
|
0
|
0
|
|
|
|
|
vector unknown_weights(dimension); |
16860
|
0
|
0
|
|
|
|
|
if (min_count > 1) { |
16861
|
|
|
|
|
|
|
uniform_real_distribution uniform(-1, 1); |
16862
|
|
|
|
|
|
|
|
16863
|
0
|
0
|
|
|
|
|
for (auto&& weight : unknown_weights) |
16864
|
0
|
|
|
|
|
|
weight = uniform(generator); |
16865
|
|
|
|
|
|
|
} |
16866
|
|
|
|
|
|
|
|
16867
|
|
|
|
|
|
|
// Add the embedding |
16868
|
0
|
0
|
|
|
|
|
parser.embeddings.emplace_back(); |
16869
|
0
|
0
|
|
|
|
|
parser.embeddings.back().create(dimension, updatable_index, weights, unknown_weights); |
16870
|
|
|
|
|
|
|
|
16871
|
|
|
|
|
|
|
// Count the cover of this embedding |
16872
|
|
|
|
|
|
|
string buffer; |
16873
|
|
|
|
|
|
|
unsigned words_total = 0, words_covered = 0, words_covered_from_file = 0; |
16874
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
16875
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
16876
|
0
|
0
|
|
|
|
|
if (node.id) { |
16877
|
0
|
0
|
|
|
|
|
parser.values.back().extract(node, word); |
16878
|
0
|
|
|
|
|
|
words_total++; |
16879
|
0
|
0
|
|
|
|
|
int word_id = parser.embeddings.back().lookup_word(word, buffer); |
16880
|
0
|
|
|
|
|
|
words_covered += word_id != parser.embeddings.back().unknown_word(); |
16881
|
0
|
0
|
|
|
|
|
words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file; |
|
|
0
|
|
|
|
|
|
16882
|
|
|
|
|
|
|
} |
16883
|
|
|
|
|
|
|
|
16884
|
|
|
|
|
|
|
cerr << "Initialized '" << tokens[0] << "' embedding with " << embeddings_from_file << embeddings_from_file_comment |
16885
|
0
|
|
|
|
|
|
<< "," << weights.size() << " words and " << fixed << setprecision(1) << 100. * words_covered_from_file / words_total |
16886
|
0
|
|
|
|
|
|
<< "%," << 100. * words_covered / words_total << "% coverage." << endl; |
16887
|
|
|
|
|
|
|
} |
16888
|
|
|
|
|
|
|
|
16889
|
|
|
|
|
|
|
// Train the network |
16890
|
|
|
|
|
|
|
unsigned total_dimension = 0, total_nodes = 0; |
16891
|
0
|
0
|
|
|
|
|
for (auto&& embedding : parser.embeddings) total_dimension += embedding.dimension; |
16892
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) total_nodes += tree.nodes.size() - 1; |
16893
|
0
|
|
|
|
|
|
auto scaled_parameters = parameters; |
16894
|
0
|
|
|
|
|
|
scaled_parameters.l1_regularization /= train.size(); |
16895
|
0
|
|
|
|
|
|
scaled_parameters.l2_regularization /= total_nodes; |
16896
|
0
|
0
|
|
|
|
|
neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator); |
|
|
0
|
|
|
|
|
|
16897
|
|
|
|
|
|
|
|
16898
|
0
|
|
|
|
|
|
neural_network heldout_best_network; |
16899
|
|
|
|
|
|
|
unsigned heldout_best_correct_labelled = 0, heldout_best_iteration = 0; |
16900
|
|
|
|
|
|
|
|
16901
|
|
|
|
|
|
|
vector permutation; |
16902
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < train.size(); i++) |
16903
|
0
|
|
|
|
|
|
permutation.push_back(permutation.size()); |
16904
|
|
|
|
|
|
|
|
16905
|
0
|
0
|
|
|
|
|
for (int iteration = 1; network_trainer.next_iteration(); iteration++) { |
16906
|
|
|
|
|
|
|
// Train on training data |
16907
|
0
|
|
|
|
|
|
shuffle(permutation.begin(), permutation.end(), generator); |
16908
|
|
|
|
|
|
|
|
16909
|
0
|
|
|
|
|
|
atomic atomic_index(0); |
16910
|
0
|
|
|
|
|
|
atomic atomic_logprob(0); |
16911
|
0
|
|
|
|
|
|
auto training = [&]() { |
16912
|
0
|
|
|
|
|
|
tree t; |
16913
|
0
|
|
|
|
|
|
configuration conf(single_root); |
16914
|
|
|
|
|
|
|
string word, word_buffer; |
16915
|
0
|
|
|
|
|
|
vector> nodes_embeddings; |
16916
|
|
|
|
|
|
|
vector extracted_nodes; |
16917
|
|
|
|
|
|
|
vector*> extracted_embeddings; |
16918
|
0
|
|
|
|
|
|
neural_network_trainer::workspace workspace; |
16919
|
|
|
|
|
|
|
double logprob = 0; |
16920
|
|
|
|
|
|
|
|
16921
|
|
|
|
|
|
|
// Data for structured prediction |
16922
|
0
|
0
|
|
|
|
|
tree t_eval; |
16923
|
0
|
|
|
|
|
|
configuration conf_eval(single_root); |
16924
|
0
|
|
|
|
|
|
vector> nodes_embeddings_eval; |
16925
|
|
|
|
|
|
|
vector extracted_nodes_eval; |
16926
|
|
|
|
|
|
|
vector*> extracted_embeddings_eval; |
16927
|
|
|
|
|
|
|
vector transitions_eval; |
16928
|
|
|
|
|
|
|
vector hidden_layer_eval, outcomes_eval; |
16929
|
|
|
|
|
|
|
|
16930
|
0
|
0
|
|
|
|
|
for (unsigned current_index; (current_index = atomic_index++) < permutation.size();) { |
16931
|
0
|
|
|
|
|
|
const tree& gold = train[permutation[current_index]]; |
16932
|
|
|
|
|
|
|
t = gold; |
16933
|
|
|
|
|
|
|
t.unlink_all_nodes(); |
16934
|
0
|
0
|
|
|
|
|
conf.init(&t); |
16935
|
|
|
|
|
|
|
|
16936
|
|
|
|
|
|
|
// Compute embeddings |
16937
|
0
|
0
|
|
|
|
|
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
|
0
|
|
|
|
|
|
16938
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < t.nodes.size(); i++) { |
16939
|
0
|
0
|
|
|
|
|
nodes_embeddings[i].resize(parser.embeddings.size()); |
16940
|
0
|
0
|
|
|
|
|
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
16941
|
0
|
0
|
|
|
|
|
parser.values[j].extract(t.nodes[i], word); |
16942
|
0
|
0
|
|
|
|
|
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
16943
|
|
|
|
|
|
|
} |
16944
|
|
|
|
|
|
|
} |
16945
|
|
|
|
|
|
|
|
16946
|
|
|
|
|
|
|
// Create tree oracle |
16947
|
0
|
0
|
|
|
|
|
auto tree_oracle = oracle->create_tree_oracle(gold); |
16948
|
|
|
|
|
|
|
|
16949
|
|
|
|
|
|
|
// Train the network |
16950
|
0
|
0
|
|
|
|
|
while (!conf.final()) { |
16951
|
|
|
|
|
|
|
// Extract nodes |
16952
|
0
|
0
|
|
|
|
|
parser.nodes.extract(conf, extracted_nodes); |
16953
|
0
|
0
|
|
|
|
|
extracted_embeddings.resize(extracted_nodes.size()); |
16954
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < extracted_nodes.size(); i++) |
16955
|
0
|
0
|
|
|
|
|
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
16956
|
|
|
|
|
|
|
|
16957
|
|
|
|
|
|
|
// Propagate |
16958
|
0
|
0
|
|
|
|
|
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
16959
|
|
|
|
|
|
|
|
16960
|
|
|
|
|
|
|
// Find most probable applicable transition |
16961
|
|
|
|
|
|
|
int network_best = -1; |
16962
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < workspace.outcomes.size(); i++) |
16963
|
0
|
0
|
|
|
|
|
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
16964
|
0
|
|
|
|
|
|
network_best = i; |
16965
|
|
|
|
|
|
|
|
16966
|
|
|
|
|
|
|
// Apply the oracle |
16967
|
0
|
0
|
|
|
|
|
auto prediction = tree_oracle->predict(conf, network_best, iteration); |
16968
|
|
|
|
|
|
|
|
16969
|
|
|
|
|
|
|
// If the best transition is applicable, train on it |
16970
|
0
|
0
|
|
|
|
|
if (parser.system->applicable(conf, prediction.best)) { |
|
|
0
|
|
|
|
|
|
16971
|
|
|
|
|
|
|
// Update logprob |
16972
|
0
|
0
|
|
|
|
|
if (workspace.outcomes[prediction.best]) |
16973
|
0
|
|
|
|
|
|
logprob += log(workspace.outcomes[prediction.best]); |
16974
|
|
|
|
|
|
|
|
16975
|
|
|
|
|
|
|
// Backpropagate the chosen outcome |
16976
|
0
|
0
|
|
|
|
|
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, prediction.best, workspace); |
16977
|
|
|
|
|
|
|
} |
16978
|
|
|
|
|
|
|
|
16979
|
|
|
|
|
|
|
// Emergency break if the to_follow transition is not applicable |
16980
|
0
|
0
|
|
|
|
|
if (!parser.system->applicable(conf, prediction.to_follow)) |
|
|
0
|
|
|
|
|
|
16981
|
|
|
|
|
|
|
break; |
16982
|
|
|
|
|
|
|
|
16983
|
|
|
|
|
|
|
// Follow the chosen outcome |
16984
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf, prediction.to_follow); |
16985
|
|
|
|
|
|
|
|
16986
|
|
|
|
|
|
|
// If a node was linked, recompute its embeddings as deprel has changed |
16987
|
0
|
0
|
|
|
|
|
if (child >= 0) |
16988
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
16989
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t.nodes[child], word); |
16990
|
0
|
0
|
|
|
|
|
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
16991
|
|
|
|
|
|
|
} |
16992
|
|
|
|
|
|
|
} |
16993
|
|
|
|
|
|
|
network_trainer.finalize_sentence(); |
16994
|
|
|
|
|
|
|
|
16995
|
|
|
|
|
|
|
// Structured prediction |
16996
|
0
|
0
|
|
|
|
|
if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) { |
|
|
0
|
|
|
|
|
|
16997
|
0
|
|
|
|
|
|
uniform_int_distribution train_distribution(0, train.size() - 1); |
16998
|
0
|
|
|
|
|
|
const tree& gold = train[train_distribution(generator)]; |
16999
|
|
|
|
|
|
|
t = gold; |
17000
|
|
|
|
|
|
|
t.unlink_all_nodes(); |
17001
|
0
|
0
|
|
|
|
|
conf.init(&t); |
17002
|
|
|
|
|
|
|
|
17003
|
|
|
|
|
|
|
// Compute embeddings |
17004
|
0
|
0
|
|
|
|
|
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
|
0
|
|
|
|
|
|
17005
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < t.nodes.size(); i++) { |
17006
|
0
|
0
|
|
|
|
|
nodes_embeddings[i].resize(parser.embeddings.size()); |
17007
|
0
|
0
|
|
|
|
|
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
17008
|
0
|
0
|
|
|
|
|
parser.values[j].extract(t.nodes[i], word); |
17009
|
0
|
0
|
|
|
|
|
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
17010
|
|
|
|
|
|
|
} |
17011
|
|
|
|
|
|
|
} |
17012
|
|
|
|
|
|
|
|
17013
|
|
|
|
|
|
|
// Create tree oracle |
17014
|
0
|
0
|
|
|
|
|
auto tree_oracle = oracle->create_tree_oracle(gold); |
17015
|
|
|
|
|
|
|
|
17016
|
|
|
|
|
|
|
// Train the network |
17017
|
0
|
0
|
|
|
|
|
while (!conf.final()) { |
17018
|
|
|
|
|
|
|
// Extract nodes |
17019
|
0
|
0
|
|
|
|
|
parser.nodes.extract(conf, extracted_nodes); |
17020
|
0
|
0
|
|
|
|
|
extracted_embeddings.resize(extracted_nodes.size()); |
17021
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < extracted_nodes.size(); i++) |
17022
|
0
|
0
|
|
|
|
|
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
17023
|
|
|
|
|
|
|
|
17024
|
|
|
|
|
|
|
// Find the best transition |
17025
|
|
|
|
|
|
|
int best = 0; |
17026
|
|
|
|
|
|
|
int best_uas = -1; |
17027
|
0
|
0
|
|
|
|
|
tree_oracle->interesting_transitions(conf, transitions_eval); |
17028
|
0
|
0
|
|
|
|
|
for (auto&& transition : transitions_eval) { |
17029
|
|
|
|
|
|
|
t_eval = t; |
17030
|
0
|
0
|
|
|
|
|
conf_eval = conf; |
17031
|
0
|
|
|
|
|
|
conf_eval.t = &t_eval; |
17032
|
0
|
0
|
|
|
|
|
nodes_embeddings_eval = nodes_embeddings; |
17033
|
|
|
|
|
|
|
|
17034
|
|
|
|
|
|
|
// Perform probed transition |
17035
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf_eval, transition); |
17036
|
0
|
0
|
|
|
|
|
if (child >= 0) |
17037
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
17038
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t_eval.nodes[child], word); |
17039
|
0
|
0
|
|
|
|
|
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
17040
|
|
|
|
|
|
|
} |
17041
|
|
|
|
|
|
|
|
17042
|
|
|
|
|
|
|
// Train the network |
17043
|
0
|
0
|
|
|
|
|
while (!conf_eval.final()) { |
17044
|
|
|
|
|
|
|
// Extract nodes |
17045
|
0
|
0
|
|
|
|
|
parser.nodes.extract(conf_eval, extracted_nodes_eval); |
17046
|
0
|
0
|
|
|
|
|
extracted_embeddings_eval.resize(extracted_nodes_eval.size()); |
17047
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < extracted_nodes_eval.size(); i++) |
17048
|
0
|
0
|
|
|
|
|
extracted_embeddings_eval[i] = extracted_nodes_eval[i] >= 0 ? &nodes_embeddings_eval[extracted_nodes_eval[i]] : nullptr; |
17049
|
|
|
|
|
|
|
|
17050
|
|
|
|
|
|
|
// Classify using neural network |
17051
|
0
|
0
|
|
|
|
|
parser.network.propagate(parser.embeddings, extracted_embeddings_eval, hidden_layer_eval, outcomes_eval, nullptr, false); |
17052
|
|
|
|
|
|
|
|
17053
|
|
|
|
|
|
|
// Find most probable applicable transition |
17054
|
|
|
|
|
|
|
int network_best = -1; |
17055
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_eval.size(); i++) |
17056
|
0
|
0
|
|
|
|
|
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17057
|
0
|
|
|
|
|
|
network_best = i; |
17058
|
|
|
|
|
|
|
|
17059
|
|
|
|
|
|
|
// Perform the best transition |
17060
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf_eval, network_best); |
17061
|
|
|
|
|
|
|
|
17062
|
|
|
|
|
|
|
// If a node was linked, recompute its embeddings as deprel has changed |
17063
|
0
|
0
|
|
|
|
|
if (child >= 0) |
17064
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
17065
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t_eval.nodes[child], word); |
17066
|
0
|
0
|
|
|
|
|
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
17067
|
|
|
|
|
|
|
} |
17068
|
|
|
|
|
|
|
} |
17069
|
|
|
|
|
|
|
|
17070
|
|
|
|
|
|
|
int uas = 0; |
17071
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < gold.nodes.size(); i++) |
17072
|
0
|
|
|
|
|
|
uas += gold.nodes[i].head == t_eval.nodes[i].head; |
17073
|
|
|
|
|
|
|
|
17074
|
0
|
0
|
|
|
|
|
if (uas > best_uas) best = transition, best_uas = uas; |
17075
|
|
|
|
|
|
|
} |
17076
|
|
|
|
|
|
|
|
17077
|
|
|
|
|
|
|
// Propagate |
17078
|
0
|
0
|
|
|
|
|
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
17079
|
|
|
|
|
|
|
|
17080
|
|
|
|
|
|
|
// Backpropagate for the best transition |
17081
|
0
|
0
|
|
|
|
|
if (workspace.outcomes[best]) |
17082
|
0
|
|
|
|
|
|
logprob += log(workspace.outcomes[best]); |
17083
|
0
|
0
|
|
|
|
|
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, best, workspace); |
17084
|
|
|
|
|
|
|
|
17085
|
|
|
|
|
|
|
// // Find most probable applicable transition when following network outcome |
17086
|
|
|
|
|
|
|
// int network_best = -1; |
17087
|
|
|
|
|
|
|
// for (unsigned i = 0; i < workspace.outcomes.size(); i++) |
17088
|
|
|
|
|
|
|
// if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
17089
|
|
|
|
|
|
|
// network_best = i; |
17090
|
|
|
|
|
|
|
|
17091
|
|
|
|
|
|
|
// Follow the best outcome |
17092
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf, /*network_*/best); |
17093
|
|
|
|
|
|
|
|
17094
|
|
|
|
|
|
|
// If a node was linked, recompute its embeddings as deprel has changed |
17095
|
0
|
0
|
|
|
|
|
if (child >= 0) |
17096
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
17097
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t.nodes[child], word); |
17098
|
0
|
0
|
|
|
|
|
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
17099
|
|
|
|
|
|
|
} |
17100
|
|
|
|
|
|
|
} |
17101
|
|
|
|
|
|
|
network_trainer.finalize_sentence(); |
17102
|
|
|
|
|
|
|
} |
17103
|
|
|
|
|
|
|
} |
17104
|
0
|
0
|
|
|
|
|
for (double old_atomic_logprob = atomic_logprob; atomic_logprob.compare_exchange_weak(old_atomic_logprob, old_atomic_logprob + logprob); ) {} |
17105
|
0
|
|
|
|
|
|
}; |
17106
|
|
|
|
|
|
|
|
17107
|
0
|
0
|
|
|
|
|
cerr << "Iteration " << iteration << ": "; |
|
|
0
|
|
|
|
|
|
17108
|
0
|
0
|
|
|
|
|
training(); |
17109
|
|
|
|
|
|
|
cerr << "training logprob " << scientific << setprecision(4) << atomic_logprob; |
17110
|
|
|
|
|
|
|
|
17111
|
|
|
|
|
|
|
// Evaluate heldout data if present |
17112
|
0
|
0
|
|
|
|
|
if (!heldout.empty()) { |
17113
|
0
|
0
|
|
|
|
|
tree t; |
17114
|
|
|
|
|
|
|
unsigned total = 0, correct_unlabelled = 0, correct_labelled = 0; |
17115
|
0
|
0
|
|
|
|
|
for (auto&& gold : heldout) { |
17116
|
|
|
|
|
|
|
t = gold; |
17117
|
|
|
|
|
|
|
t.unlink_all_nodes(); |
17118
|
|
|
|
|
|
|
parser.parse(t); |
17119
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < t.nodes.size(); i++) { |
17120
|
0
|
|
|
|
|
|
total++; |
17121
|
0
|
|
|
|
|
|
correct_unlabelled += t.nodes[i].head == gold.nodes[i].head; |
17122
|
0
|
0
|
|
|
|
|
correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel; |
|
|
0
|
|
|
|
|
|
17123
|
|
|
|
|
|
|
} |
17124
|
|
|
|
|
|
|
} |
17125
|
|
|
|
|
|
|
|
17126
|
0
|
0
|
|
|
|
|
cerr << ", heldout UAS " << fixed << setprecision(2) << (100. * correct_unlabelled / total) << "%, LAS " << (100. * correct_labelled / total) << "%"; |
17127
|
|
|
|
|
|
|
|
17128
|
0
|
0
|
|
|
|
|
if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) { |
|
|
0
|
|
|
|
|
|
17129
|
0
|
0
|
|
|
|
|
heldout_best_network = parser.network; |
17130
|
|
|
|
|
|
|
heldout_best_correct_labelled = correct_labelled; |
17131
|
0
|
|
|
|
|
|
heldout_best_iteration = iteration; |
17132
|
|
|
|
|
|
|
} |
17133
|
|
|
|
|
|
|
} |
17134
|
|
|
|
|
|
|
|
17135
|
|
|
|
|
|
|
cerr << endl; |
17136
|
|
|
|
|
|
|
} |
17137
|
|
|
|
|
|
|
|
17138
|
0
|
0
|
|
|
|
|
if (parameters.early_stopping && heldout_best_iteration > 0) { |
|
|
0
|
|
|
|
|
|
17139
|
|
|
|
|
|
|
cerr << "Using early stopping -- choosing network from iteration " << heldout_best_iteration << endl; |
17140
|
0
|
0
|
|
|
|
|
parser.network = heldout_best_network; |
17141
|
|
|
|
|
|
|
} |
17142
|
|
|
|
|
|
|
|
17143
|
|
|
|
|
|
|
// Encode version |
17144
|
0
|
0
|
|
|
|
|
enc.add_1B(parser.version); |
17145
|
|
|
|
|
|
|
|
17146
|
|
|
|
|
|
|
// Encode single_root |
17147
|
0
|
0
|
|
|
|
|
enc.add_1B(single_root); |
17148
|
|
|
|
|
|
|
|
17149
|
|
|
|
|
|
|
// Encode transition system |
17150
|
0
|
0
|
|
|
|
|
enc.add_2B(parser.labels.size()); |
17151
|
0
|
0
|
|
|
|
|
for (auto&& label : parser.labels) |
17152
|
0
|
0
|
|
|
|
|
enc.add_str(label); |
17153
|
0
|
0
|
|
|
|
|
enc.add_str(transition_system_name); |
17154
|
|
|
|
|
|
|
|
17155
|
|
|
|
|
|
|
// Encode nodes selector |
17156
|
0
|
0
|
|
|
|
|
enc.add_str(nodes_description); |
17157
|
|
|
|
|
|
|
|
17158
|
|
|
|
|
|
|
// Encode value extractors and embeddings |
17159
|
0
|
0
|
|
|
|
|
enc.add_2B(value_names.size()); |
17160
|
0
|
0
|
|
|
|
|
for (auto&& value_name : value_names) |
17161
|
0
|
0
|
|
|
|
|
enc.add_str(value_name); |
17162
|
0
|
0
|
|
|
|
|
for (auto&& embedding : parser.embeddings) |
17163
|
0
|
0
|
|
|
|
|
embedding.save(enc); |
17164
|
|
|
|
|
|
|
|
17165
|
|
|
|
|
|
|
// Encode the network |
17166
|
0
|
0
|
|
|
|
|
network_trainer.save_network(enc); |
17167
|
0
|
|
|
|
|
|
} |
17168
|
|
|
|
|
|
|
|
17169
|
|
|
|
|
|
|
} // namespace parsito |
17170
|
|
|
|
|
|
|
|
17171
|
|
|
|
|
|
|
///////// |
17172
|
|
|
|
|
|
|
// File: parsito/transition/transition.cpp |
17173
|
|
|
|
|
|
|
///////// |
17174
|
|
|
|
|
|
|
|
17175
|
|
|
|
|
|
|
// This file is part of Parsito . |
17176
|
|
|
|
|
|
|
// |
17177
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17178
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17179
|
|
|
|
|
|
|
// |
17180
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17181
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17182
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17183
|
|
|
|
|
|
|
|
17184
|
|
|
|
|
|
|
namespace parsito { |
17185
|
|
|
|
|
|
|
|
17186
|
|
|
|
|
|
|
// Left arc |
17187
|
387
|
|
|
|
|
|
bool transition_left_arc::applicable(const configuration& conf) const { |
17188
|
387
|
50
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
0
|
|
|
|
|
|
17189
|
|
|
|
|
|
|
return false; |
17190
|
|
|
|
|
|
|
else |
17191
|
387
|
100
|
|
|
|
|
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2]; |
|
|
100
|
|
|
|
|
|
17192
|
|
|
|
|
|
|
} |
17193
|
|
|
|
|
|
|
|
17194
|
15
|
|
|
|
|
|
int transition_left_arc::perform(configuration& conf) const { |
17195
|
15
|
50
|
|
|
|
|
assert(applicable(conf)); |
17196
|
|
|
|
|
|
|
|
17197
|
15
|
|
|
|
|
|
int parent = conf.stack.back(); conf.stack.pop_back(); |
17198
|
15
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
17199
|
15
|
|
|
|
|
|
conf.stack.push_back(parent); |
17200
|
15
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
17201
|
15
|
|
|
|
|
|
return child; |
17202
|
|
|
|
|
|
|
} |
17203
|
|
|
|
|
|
|
|
17204
|
|
|
|
|
|
|
// Right arc |
17205
|
395
|
|
|
|
|
|
bool transition_right_arc::applicable(const configuration& conf) const { |
17206
|
395
|
50
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
0
|
|
|
|
|
|
17207
|
0
|
0
|
|
|
|
|
return conf.stack.size() == 2 && conf.buffer.empty(); |
|
|
0
|
|
|
|
|
|
17208
|
395
|
50
|
|
|
|
|
else if (conf.single_root) // && !label_is_root |
17209
|
0
|
|
|
|
|
|
return conf.stack.size() > 2; |
17210
|
|
|
|
|
|
|
else |
17211
|
395
|
|
|
|
|
|
return conf.stack.size() >= 2; |
17212
|
|
|
|
|
|
|
} |
17213
|
|
|
|
|
|
|
|
17214
|
23
|
|
|
|
|
|
int transition_right_arc::perform(configuration& conf) const { |
17215
|
23
|
50
|
|
|
|
|
assert(applicable(conf)); |
17216
|
|
|
|
|
|
|
|
17217
|
23
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
17218
|
23
|
|
|
|
|
|
int parent = conf.stack.back(); |
17219
|
23
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
17220
|
23
|
|
|
|
|
|
return child; |
17221
|
|
|
|
|
|
|
} |
17222
|
|
|
|
|
|
|
|
17223
|
|
|
|
|
|
|
// Shift |
17224
|
90
|
|
|
|
|
|
bool transition_shift::applicable(const configuration& conf) const { |
17225
|
90
|
|
|
|
|
|
return !conf.buffer.empty(); |
17226
|
|
|
|
|
|
|
} |
17227
|
|
|
|
|
|
|
|
17228
|
28
|
|
|
|
|
|
int transition_shift::perform(configuration& conf) const { |
17229
|
28
|
50
|
|
|
|
|
assert(applicable(conf)); |
17230
|
|
|
|
|
|
|
|
17231
|
28
|
|
|
|
|
|
conf.stack.push_back(conf.buffer.back()); |
17232
|
|
|
|
|
|
|
conf.buffer.pop_back(); |
17233
|
28
|
|
|
|
|
|
return -1; |
17234
|
|
|
|
|
|
|
} |
17235
|
|
|
|
|
|
|
|
17236
|
|
|
|
|
|
|
// Swap |
17237
|
0
|
|
|
|
|
|
bool transition_swap::applicable(const configuration& conf) const { |
17238
|
0
|
0
|
|
|
|
|
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17239
|
|
|
|
|
|
|
} |
17240
|
|
|
|
|
|
|
|
17241
|
0
|
|
|
|
|
|
int transition_swap::perform(configuration& conf) const { |
17242
|
0
|
0
|
|
|
|
|
assert(applicable(conf)); |
17243
|
|
|
|
|
|
|
|
17244
|
0
|
|
|
|
|
|
int top = conf.stack.back(); conf.stack.pop_back(); |
17245
|
0
|
|
|
|
|
|
int to_buffer = conf.stack.back(); conf.stack.pop_back(); |
17246
|
0
|
|
|
|
|
|
conf.stack.push_back(top); |
17247
|
0
|
|
|
|
|
|
conf.buffer.push_back(to_buffer); |
17248
|
0
|
|
|
|
|
|
return -1; |
17249
|
|
|
|
|
|
|
} |
17250
|
|
|
|
|
|
|
|
17251
|
|
|
|
|
|
|
// Left arc 2 |
17252
|
0
|
|
|
|
|
|
bool transition_left_arc_2::applicable(const configuration& conf) const { |
17253
|
0
|
0
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
0
|
|
|
|
|
|
17254
|
|
|
|
|
|
|
return false; |
17255
|
|
|
|
|
|
|
else |
17256
|
0
|
0
|
|
|
|
|
return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3]; |
|
|
0
|
|
|
|
|
|
17257
|
|
|
|
|
|
|
} |
17258
|
|
|
|
|
|
|
|
17259
|
0
|
|
|
|
|
|
int transition_left_arc_2::perform(configuration& conf) const { |
17260
|
0
|
0
|
|
|
|
|
assert(applicable(conf)); |
17261
|
|
|
|
|
|
|
|
17262
|
0
|
|
|
|
|
|
int parent = conf.stack.back(); conf.stack.pop_back(); |
17263
|
0
|
|
|
|
|
|
int ignore = conf.stack.back(); conf.stack.pop_back(); |
17264
|
0
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
17265
|
0
|
|
|
|
|
|
conf.stack.push_back(ignore); |
17266
|
0
|
|
|
|
|
|
conf.stack.push_back(parent); |
17267
|
0
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
17268
|
0
|
|
|
|
|
|
return child; |
17269
|
|
|
|
|
|
|
} |
17270
|
|
|
|
|
|
|
|
17271
|
|
|
|
|
|
|
// Right arc 2 |
17272
|
0
|
|
|
|
|
|
bool transition_right_arc_2::applicable(const configuration& conf) const { |
17273
|
0
|
0
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
0
|
|
|
|
|
|
17274
|
|
|
|
|
|
|
return false; |
17275
|
0
|
0
|
|
|
|
|
else if (conf.single_root) // && !label_is_root |
17276
|
0
|
|
|
|
|
|
return conf.stack.size() >= 4; |
17277
|
|
|
|
|
|
|
else |
17278
|
0
|
|
|
|
|
|
return conf.stack.size() >= 3; |
17279
|
|
|
|
|
|
|
} |
17280
|
|
|
|
|
|
|
|
17281
|
0
|
|
|
|
|
|
int transition_right_arc_2::perform(configuration& conf) const { |
17282
|
0
|
0
|
|
|
|
|
assert(applicable(conf)); |
17283
|
|
|
|
|
|
|
|
17284
|
0
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
17285
|
0
|
|
|
|
|
|
int to_buffer = conf.stack.back(); conf.stack.pop_back(); |
17286
|
0
|
|
|
|
|
|
int parent = conf.stack.back(); |
17287
|
0
|
|
|
|
|
|
conf.buffer.push_back(to_buffer); |
17288
|
0
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
17289
|
0
|
|
|
|
|
|
return child; |
17290
|
|
|
|
|
|
|
} |
17291
|
|
|
|
|
|
|
|
17292
|
|
|
|
|
|
|
} // namespace parsito |
17293
|
|
|
|
|
|
|
|
17294
|
|
|
|
|
|
|
///////// |
17295
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_link2.h |
17296
|
|
|
|
|
|
|
///////// |
17297
|
|
|
|
|
|
|
|
17298
|
|
|
|
|
|
|
// This file is part of Parsito . |
17299
|
|
|
|
|
|
|
// |
17300
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17301
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17302
|
|
|
|
|
|
|
// |
17303
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17304
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17305
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17306
|
|
|
|
|
|
|
|
17307
|
|
|
|
|
|
|
namespace parsito { |
17308
|
|
|
|
|
|
|
|
17309
|
0
|
|
|
|
|
|
class transition_system_link2 : public transition_system { |
17310
|
|
|
|
|
|
|
public: |
17311
|
|
|
|
|
|
|
transition_system_link2(const vector& labels); |
17312
|
|
|
|
|
|
|
|
17313
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const override; |
17314
|
|
|
|
|
|
|
}; |
17315
|
|
|
|
|
|
|
|
17316
|
|
|
|
|
|
|
} // namespace parsito |
17317
|
|
|
|
|
|
|
|
17318
|
|
|
|
|
|
|
///////// |
17319
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_projective.h |
17320
|
|
|
|
|
|
|
///////// |
17321
|
|
|
|
|
|
|
|
17322
|
|
|
|
|
|
|
// This file is part of Parsito . |
17323
|
|
|
|
|
|
|
// |
17324
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17325
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17326
|
|
|
|
|
|
|
// |
17327
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17328
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17329
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17330
|
|
|
|
|
|
|
|
17331
|
|
|
|
|
|
|
namespace parsito { |
17332
|
|
|
|
|
|
|
|
17333
|
2
|
|
|
|
|
|
class transition_system_projective : public transition_system { |
17334
|
|
|
|
|
|
|
public: |
17335
|
|
|
|
|
|
|
transition_system_projective(const vector& labels); |
17336
|
|
|
|
|
|
|
|
17337
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const override; |
17338
|
|
|
|
|
|
|
}; |
17339
|
|
|
|
|
|
|
|
17340
|
|
|
|
|
|
|
} // namespace parsito |
17341
|
|
|
|
|
|
|
|
17342
|
|
|
|
|
|
|
///////// |
17343
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_swap.h |
17344
|
|
|
|
|
|
|
///////// |
17345
|
|
|
|
|
|
|
|
17346
|
|
|
|
|
|
|
// This file is part of Parsito . |
17347
|
|
|
|
|
|
|
// |
17348
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17349
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17350
|
|
|
|
|
|
|
// |
17351
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17352
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17353
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17354
|
|
|
|
|
|
|
|
17355
|
|
|
|
|
|
|
namespace parsito { |
17356
|
|
|
|
|
|
|
|
17357
|
0
|
|
|
|
|
|
class transition_system_swap : public transition_system { |
17358
|
|
|
|
|
|
|
public: |
17359
|
|
|
|
|
|
|
transition_system_swap(const vector& labels); |
17360
|
|
|
|
|
|
|
|
17361
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const override; |
17362
|
|
|
|
|
|
|
}; |
17363
|
|
|
|
|
|
|
|
17364
|
|
|
|
|
|
|
} // namespace parsito |
17365
|
|
|
|
|
|
|
|
17366
|
|
|
|
|
|
|
///////// |
17367
|
|
|
|
|
|
|
// File: parsito/transition/transition_system.cpp |
17368
|
|
|
|
|
|
|
///////// |
17369
|
|
|
|
|
|
|
|
17370
|
|
|
|
|
|
|
// This file is part of Parsito . |
17371
|
|
|
|
|
|
|
// |
17372
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17373
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17374
|
|
|
|
|
|
|
// |
17375
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17376
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17377
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17378
|
|
|
|
|
|
|
|
17379
|
|
|
|
|
|
|
namespace parsito { |
17380
|
|
|
|
|
|
|
|
17381
|
0
|
|
|
|
|
|
unsigned transition_system::transition_count() const { |
17382
|
0
|
|
|
|
|
|
return transitions.size(); |
17383
|
|
|
|
|
|
|
} |
17384
|
|
|
|
|
|
|
|
17385
|
806
|
|
|
|
|
|
bool transition_system::applicable(const configuration& conf, unsigned transition) const { |
17386
|
806
|
50
|
|
|
|
|
assert(transition < transitions.size()); |
17387
|
|
|
|
|
|
|
|
17388
|
806
|
|
|
|
|
|
return transitions[transition]->applicable(conf); |
17389
|
|
|
|
|
|
|
} |
17390
|
|
|
|
|
|
|
|
17391
|
66
|
|
|
|
|
|
int transition_system::perform(configuration& conf, unsigned transition) const { |
17392
|
66
|
50
|
|
|
|
|
assert(transition < transitions.size()); |
17393
|
|
|
|
|
|
|
|
17394
|
66
|
|
|
|
|
|
return transitions[transition]->perform(conf); |
17395
|
|
|
|
|
|
|
} |
17396
|
|
|
|
|
|
|
|
17397
|
1
|
|
|
|
|
|
transition_system* transition_system::create(const string& name, const vector& labels) { |
17398
|
1
|
50
|
|
|
|
|
if (name == "projective") return new transition_system_projective(labels); |
|
|
50
|
|
|
|
|
|
17399
|
0
|
0
|
|
|
|
|
if (name == "swap") return new transition_system_swap(labels); |
|
|
0
|
|
|
|
|
|
17400
|
1
|
0
|
|
|
|
|
if (name == "link2") return new transition_system_link2(labels); |
|
|
0
|
|
|
|
|
|
17401
|
|
|
|
|
|
|
return nullptr; |
17402
|
|
|
|
|
|
|
} |
17403
|
|
|
|
|
|
|
|
17404
|
|
|
|
|
|
|
} // namespace parsito |
17405
|
|
|
|
|
|
|
|
17406
|
|
|
|
|
|
|
///////// |
17407
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_link2.cpp |
17408
|
|
|
|
|
|
|
///////// |
17409
|
|
|
|
|
|
|
|
17410
|
|
|
|
|
|
|
// This file is part of Parsito . |
17411
|
|
|
|
|
|
|
// |
17412
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17413
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17414
|
|
|
|
|
|
|
// |
17415
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17416
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17417
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17418
|
|
|
|
|
|
|
|
17419
|
|
|
|
|
|
|
namespace parsito { |
17420
|
|
|
|
|
|
|
|
17421
|
0
|
|
|
|
|
|
transition_system_link2::transition_system_link2(const vector& labels) : transition_system(labels) { |
17422
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_shift()); |
|
|
0
|
|
|
|
|
|
17423
|
0
|
0
|
|
|
|
|
for (auto&& label : labels) { |
17424
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_left_arc(label)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17425
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_right_arc(label)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17426
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_left_arc_2(label)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17427
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_right_arc_2(label)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17428
|
|
|
|
|
|
|
} |
17429
|
0
|
|
|
|
|
|
} |
17430
|
|
|
|
|
|
|
|
17431
|
|
|
|
|
|
|
// Static oracle |
17432
|
0
|
|
|
|
|
|
class transition_system_link2_oracle_static : public transition_oracle { |
17433
|
|
|
|
|
|
|
public: |
17434
|
0
|
|
|
|
|
|
transition_system_link2_oracle_static(const vector& labels) : labels(labels) { |
17435
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0
|
|
|
|
|
|
17436
|
0
|
|
|
|
|
|
} |
17437
|
|
|
|
|
|
|
|
17438
|
0
|
|
|
|
|
|
class tree_oracle_static : public transition_oracle::tree_oracle { |
17439
|
|
|
|
|
|
|
public: |
17440
|
0
|
|
|
|
|
|
tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {} |
17441
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
17442
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
17443
|
|
|
|
|
|
|
private: |
17444
|
|
|
|
|
|
|
const vector& labels; |
17445
|
|
|
|
|
|
|
unsigned root_label; |
17446
|
|
|
|
|
|
|
const tree& gold; |
17447
|
|
|
|
|
|
|
}; |
17448
|
|
|
|
|
|
|
|
17449
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
17450
|
|
|
|
|
|
|
private: |
17451
|
|
|
|
|
|
|
const vector& labels; |
17452
|
|
|
|
|
|
|
unsigned root_label; |
17453
|
|
|
|
|
|
|
}; |
17454
|
|
|
|
|
|
|
|
17455
|
0
|
|
|
|
|
|
unique_ptr transition_system_link2_oracle_static::create_tree_oracle(const tree& gold) const { |
17456
|
0
|
|
|
|
|
|
return unique_ptr(new tree_oracle_static(labels, root_label, gold)); |
17457
|
|
|
|
|
|
|
} |
17458
|
|
|
|
|
|
|
|
17459
|
0
|
|
|
|
|
|
void transition_system_link2_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const { |
17460
|
|
|
|
|
|
|
transitions.clear(); |
17461
|
|
|
|
|
|
|
|
17462
|
|
|
|
|
|
|
// Shift |
17463
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) transitions.push_back(0); |
17464
|
|
|
|
|
|
|
|
17465
|
|
|
|
|
|
|
// Arcs |
17466
|
0
|
|
|
|
|
|
unsigned parents[4] = {1, 2, 1, 3}; |
17467
|
0
|
|
|
|
|
|
unsigned children[4] = {2, 1, 3, 1}; |
17468
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 4; direction++) |
17469
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17470
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - parents[direction]]; |
17471
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - children[direction]]; |
17472
|
|
|
|
|
|
|
|
17473
|
|
|
|
|
|
|
// Allow arc_2 only when seeing golden edge. |
17474
|
0
|
0
|
|
|
|
|
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17475
|
|
|
|
|
|
|
|
17476
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17477
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17478
|
0
|
0
|
|
|
|
|
if (!conf.single_root || |
|
|
0
|
|
|
|
|
|
17479
|
0
|
0
|
|
|
|
|
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17480
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17481
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 3 && direction >= 2)) |
|
|
0
|
|
|
|
|
|
17482
|
0
|
|
|
|
|
|
transitions.push_back(1 + 4*i + direction); |
17483
|
|
|
|
|
|
|
} |
17484
|
0
|
|
|
|
|
|
} |
17485
|
|
|
|
|
|
|
|
17486
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_link2_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const { |
17487
|
|
|
|
|
|
|
// Arcs |
17488
|
0
|
|
|
|
|
|
unsigned parents[4] = {1, 2, 1, 3}; |
17489
|
0
|
|
|
|
|
|
unsigned children[4] = {2, 1, 3, 1}; |
17490
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 4; direction++) |
17491
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17492
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - parents[direction]]; |
17493
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - children[direction]]; |
17494
|
|
|
|
|
|
|
|
17495
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17496
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17497
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17498
|
0
|
|
|
|
|
|
return predicted_transition(1 + 4*i + direction, 1 + 4*i + direction); |
17499
|
|
|
|
|
|
|
|
17500
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
17501
|
|
|
|
|
|
|
} |
17502
|
|
|
|
|
|
|
} |
17503
|
|
|
|
|
|
|
|
17504
|
|
|
|
|
|
|
// Otherwise, just shift |
17505
|
0
|
|
|
|
|
|
return predicted_transition(0, 0); |
17506
|
|
|
|
|
|
|
} |
17507
|
|
|
|
|
|
|
|
17508
|
|
|
|
|
|
|
// Oracle factory method |
17509
|
0
|
|
|
|
|
|
transition_oracle* transition_system_link2::oracle(const string& name) const { |
17510
|
0
|
0
|
|
|
|
|
if (name == "static") return new transition_system_link2_oracle_static(labels); |
|
|
0
|
|
|
|
|
|
17511
|
|
|
|
|
|
|
return nullptr; |
17512
|
|
|
|
|
|
|
} |
17513
|
|
|
|
|
|
|
|
17514
|
|
|
|
|
|
|
} // namespace parsito |
17515
|
|
|
|
|
|
|
|
17516
|
|
|
|
|
|
|
///////// |
17517
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_projective.cpp |
17518
|
|
|
|
|
|
|
///////// |
17519
|
|
|
|
|
|
|
|
17520
|
|
|
|
|
|
|
// This file is part of Parsito . |
17521
|
|
|
|
|
|
|
// |
17522
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17523
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17524
|
|
|
|
|
|
|
// |
17525
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17526
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17527
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17528
|
|
|
|
|
|
|
|
17529
|
|
|
|
|
|
|
namespace parsito { |
17530
|
|
|
|
|
|
|
|
17531
|
1
|
|
|
|
|
|
transition_system_projective::transition_system_projective(const vector& labels) : transition_system(labels) { |
17532
|
1
|
50
|
|
|
|
|
transitions.emplace_back(new transition_shift()); |
|
|
50
|
|
|
|
|
|
17533
|
7
|
100
|
|
|
|
|
for (auto&& label : labels) { |
17534
|
6
|
50
|
|
|
|
|
transitions.emplace_back(new transition_left_arc(label)); |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
17535
|
6
|
50
|
|
|
|
|
transitions.emplace_back(new transition_right_arc(label)); |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
17536
|
|
|
|
|
|
|
} |
17537
|
1
|
|
|
|
|
|
} |
17538
|
|
|
|
|
|
|
|
17539
|
|
|
|
|
|
|
// Static oracle |
17540
|
0
|
|
|
|
|
|
class transition_system_projective_oracle_static : public transition_oracle { |
17541
|
|
|
|
|
|
|
public: |
17542
|
0
|
|
|
|
|
|
transition_system_projective_oracle_static(const vector& labels) : labels(labels) { |
17543
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0
|
|
|
|
|
|
17544
|
0
|
|
|
|
|
|
} |
17545
|
|
|
|
|
|
|
|
17546
|
0
|
|
|
|
|
|
class tree_oracle_static : public transition_oracle::tree_oracle { |
17547
|
|
|
|
|
|
|
public: |
17548
|
0
|
|
|
|
|
|
tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {} |
17549
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
17550
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
17551
|
|
|
|
|
|
|
private: |
17552
|
|
|
|
|
|
|
const vector& labels; |
17553
|
|
|
|
|
|
|
unsigned root_label; |
17554
|
|
|
|
|
|
|
const tree& gold; |
17555
|
|
|
|
|
|
|
}; |
17556
|
|
|
|
|
|
|
|
17557
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
17558
|
|
|
|
|
|
|
private: |
17559
|
|
|
|
|
|
|
const vector& labels; |
17560
|
|
|
|
|
|
|
unsigned root_label; |
17561
|
|
|
|
|
|
|
}; |
17562
|
|
|
|
|
|
|
|
17563
|
0
|
|
|
|
|
|
unique_ptr transition_system_projective_oracle_static::create_tree_oracle(const tree& gold) const { |
17564
|
0
|
|
|
|
|
|
return unique_ptr(new tree_oracle_static(labels, root_label, gold)); |
17565
|
|
|
|
|
|
|
} |
17566
|
|
|
|
|
|
|
|
17567
|
0
|
|
|
|
|
|
void transition_system_projective_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const { |
17568
|
|
|
|
|
|
|
transitions.clear(); |
17569
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) transitions.push_back(0); |
17570
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) |
17571
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 2; direction++) { |
17572
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2 + direction]; |
17573
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17574
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17575
|
0
|
0
|
|
|
|
|
if (!conf.single_root || |
|
|
0
|
|
|
|
|
|
17576
|
0
|
0
|
|
|
|
|
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17577
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 2)) |
17578
|
0
|
|
|
|
|
|
transitions.push_back(1 + 2*i + direction); |
17579
|
|
|
|
|
|
|
} |
17580
|
0
|
|
|
|
|
|
} |
17581
|
|
|
|
|
|
|
|
17582
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_projective_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const { |
17583
|
|
|
|
|
|
|
// Use left if appropriate |
17584
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
17585
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 1]; |
17586
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2]; |
17587
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent) { |
17588
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17589
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17590
|
0
|
|
|
|
|
|
return predicted_transition(1 + 2*i, 1 + 2*i); |
17591
|
|
|
|
|
|
|
|
17592
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
17593
|
|
|
|
|
|
|
} |
17594
|
|
|
|
|
|
|
} |
17595
|
|
|
|
|
|
|
|
17596
|
|
|
|
|
|
|
// Use right if appropriate |
17597
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
17598
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 1]; |
17599
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 2]; |
17600
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17601
|
0
|
0
|
|
|
|
|
(conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) { |
|
|
0
|
|
|
|
|
|
17602
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17603
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17604
|
0
|
|
|
|
|
|
return predicted_transition(1 + 2*i + 1, 1 + 2*i + 1); |
17605
|
|
|
|
|
|
|
|
17606
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
17607
|
|
|
|
|
|
|
} |
17608
|
|
|
|
|
|
|
} |
17609
|
|
|
|
|
|
|
|
17610
|
|
|
|
|
|
|
// Otherwise, just shift |
17611
|
0
|
|
|
|
|
|
return predicted_transition(0, 0); |
17612
|
|
|
|
|
|
|
} |
17613
|
|
|
|
|
|
|
|
17614
|
|
|
|
|
|
|
// Dynamic oracle |
17615
|
0
|
|
|
|
|
|
class transition_system_projective_oracle_dynamic : public transition_oracle { |
17616
|
|
|
|
|
|
|
public: |
17617
|
0
|
|
|
|
|
|
transition_system_projective_oracle_dynamic(const vector& labels) : labels(labels) { |
17618
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0
|
|
|
|
|
|
17619
|
0
|
|
|
|
|
|
} |
17620
|
|
|
|
|
|
|
|
17621
|
0
|
|
|
|
|
|
class tree_oracle_dynamic : public transition_oracle::tree_oracle { |
17622
|
|
|
|
|
|
|
public: |
17623
|
0
|
|
|
|
|
|
tree_oracle_dynamic(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), gold(gold), oracle_static(labels, root_label, gold) {} |
17624
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
17625
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
17626
|
|
|
|
|
|
|
private: |
17627
|
|
|
|
|
|
|
const vector& labels; |
17628
|
|
|
|
|
|
|
const tree& gold; |
17629
|
|
|
|
|
|
|
transition_system_projective_oracle_static::tree_oracle_static oracle_static; |
17630
|
|
|
|
|
|
|
}; |
17631
|
|
|
|
|
|
|
|
17632
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
17633
|
|
|
|
|
|
|
private: |
17634
|
|
|
|
|
|
|
const vector& labels; |
17635
|
|
|
|
|
|
|
unsigned root_label; |
17636
|
|
|
|
|
|
|
}; |
17637
|
|
|
|
|
|
|
|
17638
|
0
|
|
|
|
|
|
unique_ptr transition_system_projective_oracle_dynamic::create_tree_oracle(const tree& gold) const { |
17639
|
0
|
|
|
|
|
|
return unique_ptr(new tree_oracle_dynamic(labels, root_label, gold)); |
17640
|
|
|
|
|
|
|
} |
17641
|
|
|
|
|
|
|
|
17642
|
0
|
|
|
|
|
|
void transition_system_projective_oracle_dynamic::tree_oracle_dynamic::interesting_transitions(const configuration& conf, vector& transitions) const { |
17643
|
0
|
|
|
|
|
|
oracle_static.interesting_transitions(conf, transitions); |
17644
|
0
|
|
|
|
|
|
} |
17645
|
|
|
|
|
|
|
|
17646
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_projective_oracle_dynamic::tree_oracle_dynamic::predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const { |
17647
|
|
|
|
|
|
|
// Use static oracle in the first iteration |
17648
|
0
|
0
|
|
|
|
|
if (iteration <= 1) |
17649
|
0
|
|
|
|
|
|
return oracle_static.predict(conf, network_outcome, iteration); |
17650
|
|
|
|
|
|
|
|
17651
|
|
|
|
|
|
|
// Use dynamic programming to compute transition leading to best parse tree |
17652
|
|
|
|
|
|
|
|
17653
|
|
|
|
|
|
|
// Start by computing the right stack |
17654
|
|
|
|
|
|
|
vector right_stack; |
17655
|
|
|
|
|
|
|
|
17656
|
|
|
|
|
|
|
unordered_set right_stack_inserted; |
17657
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) { |
17658
|
0
|
|
|
|
|
|
int buffer_start = conf.buffer.back(); |
17659
|
0
|
0
|
|
|
|
|
for (size_t i = conf.buffer.size(); i--; ) { |
17660
|
|
|
|
|
|
|
const auto& node = conf.buffer[i]; |
17661
|
0
|
|
|
|
|
|
bool to_right_stack = gold.nodes[node].head < buffer_start; |
17662
|
0
|
0
|
|
|
|
|
for (auto&& child : gold.nodes[node].children) |
17663
|
0
|
|
|
|
|
|
to_right_stack |= child < buffer_start || right_stack_inserted.count(child); |
17664
|
0
|
0
|
|
|
|
|
if (to_right_stack) { |
17665
|
0
|
0
|
|
|
|
|
right_stack.push_back(node); |
17666
|
|
|
|
|
|
|
right_stack_inserted.insert(node); |
17667
|
|
|
|
|
|
|
} |
17668
|
|
|
|
|
|
|
} |
17669
|
|
|
|
|
|
|
} |
17670
|
|
|
|
|
|
|
|
17671
|
|
|
|
|
|
|
// Fill the array T from the 2014 Goldberg paper |
17672
|
0
|
0
|
|
|
|
|
class t_representation { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17673
|
|
|
|
|
|
|
public: |
17674
|
0
|
|
|
|
|
|
t_representation(const vector& stack, const vector& right_stack, const tree& gold, const vector& labels) |
17675
|
0
|
0
|
|
|
|
|
: stack(stack), right_stack(right_stack), gold(gold), labels(labels) { |
|
|
0
|
|
|
|
|
|
17676
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 2; i++) { |
17677
|
0
|
0
|
|
|
|
|
costs[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
17678
|
0
|
0
|
|
|
|
|
transitions[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
17679
|
|
|
|
|
|
|
} |
17680
|
0
|
0
|
|
|
|
|
} |
|
|
0
|
|
|
|
|
|
17681
|
|
|
|
|
|
|
|
17682
|
0
|
|
|
|
|
|
void prepare(unsigned diagonal) { |
17683
|
0
|
|
|
|
|
|
costs[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), gold.nodes.size() + 1); |
17684
|
0
|
|
|
|
|
|
transitions[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), -1); |
17685
|
0
|
|
|
|
|
|
} |
17686
|
|
|
|
|
|
|
|
17687
|
0
|
|
|
|
|
|
int& cost(unsigned i, unsigned j, unsigned h) { return costs[(i+j) & 1][i * (i+j+1) + h]; } |
17688
|
0
|
|
|
|
|
|
int& transition(unsigned i, unsigned j, unsigned h) { return transitions[(i+j) & 1][i * (i+j+1) + h]; } |
17689
|
|
|
|
|
|
|
|
17690
|
0
|
0
|
|
|
|
|
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17691
|
0
|
|
|
|
|
|
int edge_cost(int parent, int child) const { return gold.nodes[child].head != parent; } |
17692
|
0
|
|
|
|
|
|
int which_arc_transition(int parent, int child) const { |
17693
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17694
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17695
|
0
|
|
|
|
|
|
return 1 + 2*i + (child > parent); |
17696
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
17697
|
|
|
|
|
|
|
return 0; // To keep VS 2015 happy and warning-free |
17698
|
|
|
|
|
|
|
} |
17699
|
|
|
|
|
|
|
|
17700
|
|
|
|
|
|
|
private: |
17701
|
|
|
|
|
|
|
const vector& stack; |
17702
|
|
|
|
|
|
|
const vector& right_stack; |
17703
|
|
|
|
|
|
|
const tree& gold; |
17704
|
|
|
|
|
|
|
const vector& labels; |
17705
|
|
|
|
|
|
|
vector costs[2], transitions[2]; |
17706
|
0
|
0
|
|
|
|
|
} t(conf.stack, right_stack, gold, labels); |
17707
|
|
|
|
|
|
|
|
17708
|
0
|
0
|
|
|
|
|
t.prepare(0); |
17709
|
0
|
|
|
|
|
|
t.cost(0, 0, 0) = 0; |
17710
|
0
|
0
|
|
|
|
|
for (unsigned diagonal = 0; diagonal < conf.stack.size() + right_stack.size(); diagonal++) { |
17711
|
0
|
0
|
|
|
|
|
t.prepare(diagonal + 1); |
17712
|
0
|
0
|
|
|
|
|
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17713
|
0
|
|
|
|
|
|
unsigned j = diagonal - i; |
17714
|
|
|
|
|
|
|
|
17715
|
|
|
|
|
|
|
// Try extending stack |
17716
|
0
|
0
|
|
|
|
|
if (i+1 < conf.stack.size()) |
17717
|
0
|
0
|
|
|
|
|
for (unsigned h = 0; h <= diagonal; h++) { |
17718
|
|
|
|
|
|
|
int h_node = t.node(i, j, h), new_node = t.node(i+1, j, 0); |
17719
|
0
|
0
|
|
|
|
|
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17720
|
0
|
|
|
|
|
|
t.cost(i+1, j, h+1) = t.cost(i, j, h) + t.edge_cost(h_node, new_node); |
17721
|
0
|
0
|
|
|
|
|
t.transition(i+1, j, h+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(h_node, new_node); |
17722
|
|
|
|
|
|
|
} |
17723
|
0
|
0
|
|
|
|
|
if (t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i+1, j, 0) + (t.transition(i, j, h) != 0)) { |
17724
|
0
|
|
|
|
|
|
t.cost(i+1, j, 0) = t.cost(i, j, h) + t.edge_cost(new_node, h_node); |
17725
|
0
|
0
|
|
|
|
|
t.transition(i+1, j, 0) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(new_node, h_node); |
17726
|
|
|
|
|
|
|
} |
17727
|
|
|
|
|
|
|
} |
17728
|
|
|
|
|
|
|
|
17729
|
|
|
|
|
|
|
// Try extending right_stack |
17730
|
0
|
0
|
|
|
|
|
if (j+1 < right_stack.size() + 1) |
17731
|
0
|
0
|
|
|
|
|
for (unsigned h = 0; h <= diagonal; h++) { |
17732
|
|
|
|
|
|
|
int h_node = t.node(i, j, h), new_node = t.node(i, j+1, diagonal+1); |
17733
|
0
|
0
|
|
|
|
|
if (t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i, j+1, h) + (t.transition(i, j, h) > 0)) { |
17734
|
0
|
|
|
|
|
|
t.cost(i, j+1, h) = t.cost(i, j, h) + t.edge_cost(h_node, new_node); |
17735
|
0
|
0
|
|
|
|
|
t.transition(i, j+1, h) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
17736
|
|
|
|
|
|
|
} |
17737
|
0
|
0
|
|
|
|
|
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17738
|
0
|
|
|
|
|
|
t.cost(i, j+1, diagonal+1) = t.cost(i, j, h) + t.edge_cost(new_node, h_node); |
17739
|
0
|
0
|
|
|
|
|
t.transition(i, j+1, diagonal+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
17740
|
|
|
|
|
|
|
} |
17741
|
|
|
|
|
|
|
} |
17742
|
|
|
|
|
|
|
} |
17743
|
|
|
|
|
|
|
} |
17744
|
|
|
|
|
|
|
|
17745
|
0
|
|
|
|
|
|
return predicted_transition(t.transition(conf.stack.size() - 1, right_stack.size(), 0), network_outcome); |
17746
|
|
|
|
|
|
|
} |
17747
|
|
|
|
|
|
|
|
17748
|
|
|
|
|
|
|
// Oracle factory method |
17749
|
0
|
|
|
|
|
|
transition_oracle* transition_system_projective::oracle(const string& name) const { |
17750
|
0
|
0
|
|
|
|
|
if (name == "static") return new transition_system_projective_oracle_static(labels); |
|
|
0
|
|
|
|
|
|
17751
|
0
|
0
|
|
|
|
|
if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels); |
|
|
0
|
|
|
|
|
|
17752
|
|
|
|
|
|
|
return nullptr; |
17753
|
|
|
|
|
|
|
} |
17754
|
|
|
|
|
|
|
|
17755
|
|
|
|
|
|
|
} // namespace parsito |
17756
|
|
|
|
|
|
|
|
17757
|
|
|
|
|
|
|
///////// |
17758
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_swap.cpp |
17759
|
|
|
|
|
|
|
///////// |
17760
|
|
|
|
|
|
|
|
17761
|
|
|
|
|
|
|
// This file is part of Parsito . |
17762
|
|
|
|
|
|
|
// |
17763
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17764
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17765
|
|
|
|
|
|
|
// |
17766
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17767
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17768
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17769
|
|
|
|
|
|
|
|
17770
|
|
|
|
|
|
|
namespace parsito { |
17771
|
|
|
|
|
|
|
|
17772
|
0
|
|
|
|
|
|
transition_system_swap::transition_system_swap(const vector& labels) : transition_system(labels) { |
17773
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_shift()); |
|
|
0
|
|
|
|
|
|
17774
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_swap()); |
|
|
0
|
|
|
|
|
|
17775
|
0
|
0
|
|
|
|
|
for (auto&& label : labels) { |
17776
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_left_arc(label)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17777
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_right_arc(label)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17778
|
|
|
|
|
|
|
} |
17779
|
0
|
|
|
|
|
|
} |
17780
|
|
|
|
|
|
|
|
17781
|
|
|
|
|
|
|
// Static oracle |
17782
|
0
|
|
|
|
|
|
class transition_system_swap_oracle_static : public transition_oracle { |
17783
|
|
|
|
|
|
|
public: |
17784
|
0
|
|
|
|
|
|
transition_system_swap_oracle_static(const vector& labels, bool lazy) : labels(labels), lazy(lazy) { |
17785
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0
|
|
|
|
|
|
17786
|
0
|
|
|
|
|
|
} |
17787
|
|
|
|
|
|
|
|
17788
|
0
|
|
|
|
|
|
class tree_oracle_static : public transition_oracle::tree_oracle { |
17789
|
|
|
|
|
|
|
public: |
17790
|
0
|
|
|
|
|
|
tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold, vector&& projective_order, vector&& projective_components) |
17791
|
0
|
0
|
|
|
|
|
: labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {} |
|
|
0
|
|
|
|
|
|
17792
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
17793
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
17794
|
|
|
|
|
|
|
private: |
17795
|
|
|
|
|
|
|
const vector& labels; |
17796
|
|
|
|
|
|
|
unsigned root_label; |
17797
|
|
|
|
|
|
|
const tree& gold; |
17798
|
|
|
|
|
|
|
const vector projective_order; |
17799
|
|
|
|
|
|
|
const vector projective_components; |
17800
|
|
|
|
|
|
|
}; |
17801
|
|
|
|
|
|
|
|
17802
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
17803
|
|
|
|
|
|
|
private: |
17804
|
|
|
|
|
|
|
void create_projective_order(const tree& gold, int node, vector& projective_order, int& projective_index) const; |
17805
|
|
|
|
|
|
|
void create_projective_component(const tree& gold, int node, vector& projective_components, int component_index) const; |
17806
|
|
|
|
|
|
|
|
17807
|
|
|
|
|
|
|
const vector& labels; |
17808
|
|
|
|
|
|
|
bool lazy; |
17809
|
|
|
|
|
|
|
unsigned root_label; |
17810
|
|
|
|
|
|
|
}; |
17811
|
|
|
|
|
|
|
|
17812
|
0
|
|
|
|
|
|
unique_ptr transition_system_swap_oracle_static::create_tree_oracle(const tree& gold) const { |
17813
|
0
|
|
|
|
|
|
vector projective_order(gold.nodes.size()); |
17814
|
|
|
|
|
|
|
int projective_index; |
17815
|
0
|
|
|
|
|
|
create_projective_order(gold, 0, projective_order, projective_index); |
17816
|
|
|
|
|
|
|
|
17817
|
|
|
|
|
|
|
vector projective_components; |
17818
|
0
|
0
|
|
|
|
|
if (lazy) { |
17819
|
0
|
0
|
|
|
|
|
tree_oracle_static projective_oracle(labels, root_label, gold, vector(), vector()); |
17820
|
0
|
|
|
|
|
|
configuration conf(false); |
17821
|
|
|
|
|
|
|
tree t = gold; |
17822
|
0
|
0
|
|
|
|
|
transition_system_swap system(labels); |
17823
|
|
|
|
|
|
|
|
17824
|
0
|
0
|
|
|
|
|
conf.init(&t); |
17825
|
0
|
0
|
|
|
|
|
while (!conf.final()) { |
17826
|
0
|
|
|
|
|
|
auto prediction = projective_oracle.predict(conf, 0, 0); |
17827
|
0
|
0
|
|
|
|
|
if (!system.applicable(conf, prediction.to_follow)) break; |
|
|
0
|
|
|
|
|
|
17828
|
0
|
0
|
|
|
|
|
system.perform(conf, prediction.to_follow); |
17829
|
|
|
|
|
|
|
} |
17830
|
|
|
|
|
|
|
|
17831
|
0
|
|
|
|
|
|
projective_components.assign(gold.nodes.size(), 0); |
17832
|
0
|
0
|
|
|
|
|
for (auto&& node : conf.stack) |
17833
|
0
|
0
|
|
|
|
|
if (node) |
17834
|
0
|
|
|
|
|
|
create_projective_component(t, node, projective_components, node); |
17835
|
|
|
|
|
|
|
} |
17836
|
|
|
|
|
|
|
|
17837
|
0
|
0
|
|
|
|
|
return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components))); |
|
|
0
|
|
|
|
|
|
17838
|
|
|
|
|
|
|
} |
17839
|
|
|
|
|
|
|
|
17840
|
0
|
|
|
|
|
|
void transition_system_swap_oracle_static::create_projective_order(const tree& gold, int node, vector& projective_order, int& projective_index) const { |
17841
|
|
|
|
|
|
|
unsigned child_index = 0; |
17842
|
0
|
0
|
|
|
|
|
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17843
|
0
|
|
|
|
|
|
create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index); |
17844
|
0
|
|
|
|
|
|
projective_order[node] = projective_index++; |
17845
|
0
|
0
|
|
|
|
|
while (child_index < gold.nodes[node].children.size()) |
17846
|
0
|
|
|
|
|
|
create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index); |
17847
|
0
|
|
|
|
|
|
} |
17848
|
|
|
|
|
|
|
|
17849
|
0
|
|
|
|
|
|
void transition_system_swap_oracle_static::create_projective_component(const tree& gold, int node, vector& projective_components, int component_index) const { |
17850
|
0
|
|
|
|
|
|
projective_components[node] = component_index; |
17851
|
0
|
0
|
|
|
|
|
for (auto&& child : gold.nodes[node].children) |
17852
|
0
|
|
|
|
|
|
create_projective_component(gold, child, projective_components, component_index); |
17853
|
0
|
|
|
|
|
|
} |
17854
|
|
|
|
|
|
|
|
17855
|
0
|
|
|
|
|
|
void transition_system_swap_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const { |
17856
|
|
|
|
|
|
|
transitions.clear(); |
17857
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) transitions.push_back(0); |
17858
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
17859
|
|
|
|
|
|
|
// Swap |
17860
|
0
|
0
|
|
|
|
|
if (!projective_order.empty()) { |
17861
|
0
|
|
|
|
|
|
int last = conf.stack[conf.stack.size() - 1]; |
17862
|
0
|
|
|
|
|
|
int prev = conf.stack[conf.stack.size() - 2]; |
17863
|
0
|
0
|
|
|
|
|
if (projective_order[last] < projective_order[prev] && |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17864
|
0
|
0
|
|
|
|
|
(projective_components.empty() || |
17865
|
0
|
0
|
|
|
|
|
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
17866
|
0
|
|
|
|
|
|
transitions.push_back(1); |
17867
|
|
|
|
|
|
|
} |
17868
|
|
|
|
|
|
|
|
17869
|
|
|
|
|
|
|
// Arcs |
17870
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 2; direction++) { |
17871
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2 + direction]; |
17872
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17873
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17874
|
0
|
0
|
|
|
|
|
if (!conf.single_root || |
|
|
0
|
|
|
|
|
|
17875
|
0
|
0
|
|
|
|
|
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17876
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 2)) |
17877
|
0
|
|
|
|
|
|
transitions.push_back(2 + 2*i + direction); |
17878
|
|
|
|
|
|
|
} |
17879
|
|
|
|
|
|
|
} |
17880
|
0
|
|
|
|
|
|
} |
17881
|
|
|
|
|
|
|
|
17882
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_swap_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const { |
17883
|
|
|
|
|
|
|
// Use left if appropriate |
17884
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
17885
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 1]; |
17886
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2]; |
17887
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17888
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17889
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17890
|
0
|
|
|
|
|
|
return predicted_transition(2 + 2*i, 2 + 2*i); |
17891
|
|
|
|
|
|
|
|
17892
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
17893
|
|
|
|
|
|
|
} |
17894
|
|
|
|
|
|
|
} |
17895
|
|
|
|
|
|
|
|
17896
|
|
|
|
|
|
|
// Use right if appropriate |
17897
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
17898
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 1]; |
17899
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 2]; |
17900
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17901
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
17902
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
17903
|
0
|
|
|
|
|
|
return predicted_transition(2 + 2*i + 1, 2 + 2*i + 1); |
17904
|
|
|
|
|
|
|
|
17905
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
17906
|
|
|
|
|
|
|
} |
17907
|
|
|
|
|
|
|
} |
17908
|
|
|
|
|
|
|
|
17909
|
|
|
|
|
|
|
// Use swap if appropriate |
17910
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17911
|
0
|
|
|
|
|
|
int last = conf.stack[conf.stack.size() - 1]; |
17912
|
0
|
|
|
|
|
|
int prev = conf.stack[conf.stack.size() - 2]; |
17913
|
0
|
0
|
|
|
|
|
if (projective_order[last] < projective_order[prev] && |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17914
|
0
|
0
|
|
|
|
|
(projective_components.empty() || |
17915
|
0
|
0
|
|
|
|
|
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
17916
|
0
|
|
|
|
|
|
return predicted_transition(1, 1); |
17917
|
|
|
|
|
|
|
} |
17918
|
|
|
|
|
|
|
|
17919
|
|
|
|
|
|
|
// Otherwise, just shift |
17920
|
0
|
|
|
|
|
|
return predicted_transition(0, 0); |
17921
|
|
|
|
|
|
|
} |
17922
|
|
|
|
|
|
|
|
17923
|
|
|
|
|
|
|
// Oracle factory method |
17924
|
0
|
|
|
|
|
|
transition_oracle* transition_system_swap::oracle(const string& name) const { |
17925
|
0
|
0
|
|
|
|
|
if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false); |
|
|
0
|
|
|
|
|
|
17926
|
0
|
0
|
|
|
|
|
if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true); |
|
|
0
|
|
|
|
|
|
17927
|
|
|
|
|
|
|
return nullptr; |
17928
|
|
|
|
|
|
|
} |
17929
|
|
|
|
|
|
|
|
17930
|
|
|
|
|
|
|
} // namespace parsito |
17931
|
|
|
|
|
|
|
|
17932
|
|
|
|
|
|
|
///////// |
17933
|
|
|
|
|
|
|
// File: parsito/tree/tree.cpp |
17934
|
|
|
|
|
|
|
///////// |
17935
|
|
|
|
|
|
|
|
17936
|
|
|
|
|
|
|
// This file is part of Parsito . |
17937
|
|
|
|
|
|
|
// |
17938
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
17939
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
17940
|
|
|
|
|
|
|
// |
17941
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
17942
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
17943
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
17944
|
|
|
|
|
|
|
|
17945
|
|
|
|
|
|
|
namespace parsito { |
17946
|
|
|
|
|
|
|
|
17947
|
2
|
|
|
|
|
|
const string tree::root_form = ""; |
17948
|
|
|
|
|
|
|
|
17949
|
1
|
|
|
|
|
|
tree::tree() { |
17950
|
1
|
50
|
|
|
|
|
clear(); |
17951
|
1
|
|
|
|
|
|
} |
17952
|
|
|
|
|
|
|
|
17953
|
0
|
|
|
|
|
|
bool tree::empty() { |
17954
|
0
|
|
|
|
|
|
return nodes.size() == 1; |
17955
|
|
|
|
|
|
|
} |
17956
|
|
|
|
|
|
|
|
17957
|
2
|
|
|
|
|
|
void tree::clear() { |
17958
|
|
|
|
|
|
|
nodes.clear(); |
17959
|
|
|
|
|
|
|
node& root = add_node(root_form); |
17960
|
8
|
|
|
|
|
|
root.lemma = root.upostag = root.xpostag = root.feats = root_form; |
17961
|
2
|
|
|
|
|
|
} |
17962
|
|
|
|
|
|
|
|
17963
|
0
|
|
|
|
|
|
node& tree::add_node(const string& form) { |
17964
|
9
|
0
|
|
|
|
|
nodes.emplace_back((int)nodes.size(), form); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
17965
|
0
|
|
|
|
|
|
return nodes.back(); |
17966
|
|
|
|
|
|
|
} |
17967
|
|
|
|
|
|
|
|
17968
|
38
|
|
|
|
|
|
void tree::set_head(int id, int head, const string& deprel) { |
17969
|
38
|
50
|
|
|
|
|
assert(id >= 0 && id < int(nodes.size())); |
|
|
50
|
|
|
|
|
|
17970
|
38
|
50
|
|
|
|
|
assert(head < int(nodes.size())); |
17971
|
|
|
|
|
|
|
|
17972
|
|
|
|
|
|
|
// Remove existing head |
17973
|
38
|
50
|
|
|
|
|
if (nodes[id].head >= 0) { |
17974
|
0
|
|
|
|
|
|
auto& children = nodes[nodes[id].head].children; |
17975
|
0
|
0
|
|
|
|
|
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
17976
|
0
|
0
|
|
|
|
|
if (children[i-1] == id) { |
17977
|
|
|
|
|
|
|
children.erase(children.begin() + i - 1); |
17978
|
0
|
|
|
|
|
|
break; |
17979
|
|
|
|
|
|
|
} |
17980
|
|
|
|
|
|
|
} |
17981
|
|
|
|
|
|
|
|
17982
|
|
|
|
|
|
|
// Set new head |
17983
|
76
|
|
|
|
|
|
nodes[id].head = head; |
17984
|
38
|
|
|
|
|
|
nodes[id].deprel = deprel; |
17985
|
38
|
50
|
|
|
|
|
if (head >= 0) { |
17986
|
76
|
|
|
|
|
|
auto& children = nodes[head].children; |
17987
|
|
|
|
|
|
|
size_t i = children.size(); |
17988
|
56
|
100
|
|
|
|
|
while (i && children[i-1] > id) i--; |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
17989
|
38
|
100
|
|
|
|
|
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
17990
|
|
|
|
|
|
|
} |
17991
|
38
|
|
|
|
|
|
} |
17992
|
|
|
|
|
|
|
|
17993
|
0
|
|
|
|
|
|
void tree::unlink_all_nodes() { |
17994
|
9
|
0
|
|
|
|
|
for (auto&& node : nodes) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
17995
|
8
|
|
|
|
|
|
node.head = -1; |
17996
|
|
|
|
|
|
|
node.deprel.clear(); |
17997
|
|
|
|
|
|
|
node.children.clear(); |
17998
|
|
|
|
|
|
|
} |
17999
|
0
|
|
|
|
|
|
} |
18000
|
|
|
|
|
|
|
|
18001
|
|
|
|
|
|
|
} // namespace parsito |
18002
|
|
|
|
|
|
|
|
18003
|
|
|
|
|
|
|
///////// |
18004
|
|
|
|
|
|
|
// File: parsito/tree/tree_format.h |
18005
|
|
|
|
|
|
|
///////// |
18006
|
|
|
|
|
|
|
|
18007
|
|
|
|
|
|
|
// This file is part of Parsito . |
18008
|
|
|
|
|
|
|
// |
18009
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18010
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18011
|
|
|
|
|
|
|
// |
18012
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18013
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18014
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18015
|
|
|
|
|
|
|
|
18016
|
|
|
|
|
|
|
namespace parsito { |
18017
|
|
|
|
|
|
|
|
18018
|
|
|
|
|
|
|
// Input format |
18019
|
0
|
|
|
|
|
|
class tree_input_format { |
18020
|
|
|
|
|
|
|
public: |
18021
|
0
|
|
|
|
|
|
virtual ~tree_input_format() {} |
18022
|
|
|
|
|
|
|
|
18023
|
|
|
|
|
|
|
virtual bool read_block(istream& in, string& block) const = 0; |
18024
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) = 0; |
18025
|
|
|
|
|
|
|
virtual bool next_tree(tree& t) = 0; |
18026
|
|
|
|
|
|
|
const string& last_error() const; |
18027
|
|
|
|
|
|
|
|
18028
|
|
|
|
|
|
|
// Static factory methods |
18029
|
|
|
|
|
|
|
static tree_input_format* new_input_format(const string& name); |
18030
|
|
|
|
|
|
|
static tree_input_format* new_conllu_input_format(); |
18031
|
|
|
|
|
|
|
|
18032
|
|
|
|
|
|
|
protected: |
18033
|
|
|
|
|
|
|
string error; |
18034
|
|
|
|
|
|
|
}; |
18035
|
|
|
|
|
|
|
|
18036
|
|
|
|
|
|
|
// Output format |
18037
|
0
|
|
|
|
|
|
class tree_output_format { |
18038
|
|
|
|
|
|
|
public: |
18039
|
0
|
|
|
|
|
|
virtual ~tree_output_format() {} |
18040
|
|
|
|
|
|
|
|
18041
|
|
|
|
|
|
|
virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const = 0; |
18042
|
|
|
|
|
|
|
|
18043
|
|
|
|
|
|
|
// Static factory methods |
18044
|
|
|
|
|
|
|
static tree_output_format* new_output_format(const string& name); |
18045
|
|
|
|
|
|
|
static tree_output_format* new_conllu_output_format(); |
18046
|
|
|
|
|
|
|
}; |
18047
|
|
|
|
|
|
|
|
18048
|
|
|
|
|
|
|
} // namespace parsito |
18049
|
|
|
|
|
|
|
|
18050
|
|
|
|
|
|
|
///////// |
18051
|
|
|
|
|
|
|
// File: parsito/tree/tree_format_conllu.h |
18052
|
|
|
|
|
|
|
///////// |
18053
|
|
|
|
|
|
|
|
18054
|
|
|
|
|
|
|
// This file is part of Parsito . |
18055
|
|
|
|
|
|
|
// |
18056
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18057
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18058
|
|
|
|
|
|
|
// |
18059
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18060
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18061
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18062
|
|
|
|
|
|
|
|
18063
|
|
|
|
|
|
|
namespace parsito { |
18064
|
|
|
|
|
|
|
|
18065
|
|
|
|
|
|
|
// Input CoNLL-U format |
18066
|
0
|
|
|
|
|
|
class tree_input_format_conllu : public tree_input_format { |
18067
|
|
|
|
|
|
|
public: |
18068
|
|
|
|
|
|
|
virtual bool read_block(istream& in, string& block) const override; |
18069
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
18070
|
|
|
|
|
|
|
virtual bool next_tree(tree& t) override; |
18071
|
|
|
|
|
|
|
|
18072
|
|
|
|
|
|
|
private: |
18073
|
|
|
|
|
|
|
friend class tree_output_format_conllu; |
18074
|
|
|
|
|
|
|
vector comments; |
18075
|
|
|
|
|
|
|
vector> multiword_tokens; |
18076
|
|
|
|
|
|
|
|
18077
|
|
|
|
|
|
|
string_piece text; |
18078
|
|
|
|
|
|
|
string text_copy; |
18079
|
|
|
|
|
|
|
}; |
18080
|
|
|
|
|
|
|
|
18081
|
|
|
|
|
|
|
// Output CoNLL-U format |
18082
|
0
|
|
|
|
|
|
class tree_output_format_conllu : public tree_output_format { |
18083
|
|
|
|
|
|
|
public: |
18084
|
|
|
|
|
|
|
virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const override; |
18085
|
|
|
|
|
|
|
|
18086
|
|
|
|
|
|
|
private: |
18087
|
|
|
|
|
|
|
static const string underscore; |
18088
|
0
|
0
|
|
|
|
|
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18089
|
|
|
|
|
|
|
}; |
18090
|
|
|
|
|
|
|
|
18091
|
|
|
|
|
|
|
} // namespace parsito |
18092
|
|
|
|
|
|
|
|
18093
|
|
|
|
|
|
|
///////// |
18094
|
|
|
|
|
|
|
// File: parsito/tree/tree_format.cpp |
18095
|
|
|
|
|
|
|
///////// |
18096
|
|
|
|
|
|
|
|
18097
|
|
|
|
|
|
|
// This file is part of Parsito . |
18098
|
|
|
|
|
|
|
// |
18099
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18100
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18101
|
|
|
|
|
|
|
// |
18102
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18103
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18104
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18105
|
|
|
|
|
|
|
|
18106
|
|
|
|
|
|
|
namespace parsito { |
18107
|
|
|
|
|
|
|
|
18108
|
0
|
|
|
|
|
|
const string& tree_input_format::last_error() const { |
18109
|
0
|
|
|
|
|
|
return error; |
18110
|
|
|
|
|
|
|
} |
18111
|
|
|
|
|
|
|
|
18112
|
|
|
|
|
|
|
// Input Static factory methods |
18113
|
0
|
|
|
|
|
|
tree_input_format* tree_input_format::new_conllu_input_format() { |
18114
|
0
|
|
|
|
|
|
return new tree_input_format_conllu(); |
18115
|
|
|
|
|
|
|
} |
18116
|
|
|
|
|
|
|
|
18117
|
0
|
|
|
|
|
|
tree_input_format* tree_input_format::new_input_format(const string& name) { |
18118
|
0
|
0
|
|
|
|
|
if (name == "conllu") return new_conllu_input_format(); |
18119
|
|
|
|
|
|
|
return nullptr; |
18120
|
|
|
|
|
|
|
} |
18121
|
|
|
|
|
|
|
|
18122
|
|
|
|
|
|
|
// Output static factory methods |
18123
|
0
|
|
|
|
|
|
tree_output_format* tree_output_format::new_conllu_output_format() { |
18124
|
0
|
|
|
|
|
|
return new tree_output_format_conllu(); |
18125
|
|
|
|
|
|
|
} |
18126
|
|
|
|
|
|
|
|
18127
|
0
|
|
|
|
|
|
tree_output_format* tree_output_format::new_output_format(const string& name) { |
18128
|
0
|
0
|
|
|
|
|
if (name == "conllu") return new_conllu_output_format(); |
18129
|
|
|
|
|
|
|
return nullptr; |
18130
|
|
|
|
|
|
|
} |
18131
|
|
|
|
|
|
|
|
18132
|
|
|
|
|
|
|
} // namespace parsito |
18133
|
|
|
|
|
|
|
|
18134
|
|
|
|
|
|
|
///////// |
18135
|
|
|
|
|
|
|
// File: parsito/tree/tree_format_conllu.cpp |
18136
|
|
|
|
|
|
|
///////// |
18137
|
|
|
|
|
|
|
|
18138
|
|
|
|
|
|
|
// This file is part of Parsito . |
18139
|
|
|
|
|
|
|
// |
18140
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18141
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18142
|
|
|
|
|
|
|
// |
18143
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18144
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18145
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18146
|
|
|
|
|
|
|
|
18147
|
|
|
|
|
|
|
namespace parsito { |
18148
|
|
|
|
|
|
|
|
18149
|
|
|
|
|
|
|
// Input CoNLL-U format |
18150
|
|
|
|
|
|
|
|
18151
|
0
|
|
|
|
|
|
bool tree_input_format_conllu::read_block(istream& in, string& block) const { |
18152
|
0
|
|
|
|
|
|
return bool(getpara(in, block)); |
18153
|
|
|
|
|
|
|
} |
18154
|
|
|
|
|
|
|
|
18155
|
0
|
|
|
|
|
|
void tree_input_format_conllu::set_text(string_piece text, bool make_copy) { |
18156
|
0
|
0
|
|
|
|
|
if (make_copy) { |
18157
|
0
|
|
|
|
|
|
text_copy.assign(text.str, text.len); |
18158
|
|
|
|
|
|
|
text = string_piece(text_copy.c_str(), text_copy.size()); |
18159
|
|
|
|
|
|
|
} |
18160
|
0
|
|
|
|
|
|
this->text = text; |
18161
|
0
|
|
|
|
|
|
} |
18162
|
|
|
|
|
|
|
|
18163
|
0
|
|
|
|
|
|
bool tree_input_format_conllu::next_tree(tree& t) { |
18164
|
|
|
|
|
|
|
error.clear(); |
18165
|
0
|
|
|
|
|
|
t.clear(); |
18166
|
|
|
|
|
|
|
comments.clear(); |
18167
|
|
|
|
|
|
|
multiword_tokens.clear(); |
18168
|
|
|
|
|
|
|
int last_multiword_token = 0; |
18169
|
|
|
|
|
|
|
|
18170
|
|
|
|
|
|
|
vector tokens, parts; |
18171
|
0
|
0
|
|
|
|
|
while (text.len) { |
18172
|
|
|
|
|
|
|
// Read line |
18173
|
0
|
|
|
|
|
|
string_piece line(text.str, 0); |
18174
|
0
|
0
|
|
|
|
|
while (line.len < text.len && line.str[line.len] != '\n') line.len++; |
|
|
0
|
|
|
|
|
|
18175
|
0
|
|
|
|
|
|
text.str += line.len + (line.len < text.len); |
18176
|
0
|
|
|
|
|
|
text.len -= line.len + (line.len < text.len); |
18177
|
|
|
|
|
|
|
|
18178
|
|
|
|
|
|
|
// Empty lines denote end of tree, unless at the beginning |
18179
|
0
|
0
|
|
|
|
|
if (!line.len) { |
18180
|
0
|
0
|
|
|
|
|
if (t.empty()) continue; |
18181
|
0
|
|
|
|
|
|
break; |
18182
|
|
|
|
|
|
|
} |
18183
|
|
|
|
|
|
|
|
18184
|
0
|
0
|
|
|
|
|
if (*line.str == '#') { |
18185
|
|
|
|
|
|
|
// Store comments at the beginning and ignore the rest |
18186
|
0
|
0
|
|
|
|
|
if (t.empty()) comments.push_back(line); |
|
|
0
|
|
|
|
|
|
18187
|
|
|
|
|
|
|
continue; |
18188
|
|
|
|
|
|
|
} |
18189
|
|
|
|
|
|
|
|
18190
|
|
|
|
|
|
|
// Parse another tree node |
18191
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
18192
|
0
|
0
|
|
|
|
|
if (tokens.size() != 10) |
18193
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18194
|
|
|
|
|
|
|
|
18195
|
|
|
|
|
|
|
// Store and skip multiword tokens |
18196
|
0
|
0
|
|
|
|
|
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
18197
|
0
|
0
|
|
|
|
|
split(tokens[0], '-', parts); |
18198
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) |
18199
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18200
|
|
|
|
|
|
|
int from, to; |
18201
|
0
|
0
|
|
|
|
|
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18202
|
|
|
|
|
|
|
return false; |
18203
|
0
|
0
|
|
|
|
|
if (from != int(t.nodes.size())) |
18204
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18205
|
0
|
0
|
|
|
|
|
if (to < from) |
18206
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18207
|
0
|
0
|
|
|
|
|
if (from <= last_multiword_token) |
18208
|
0
|
0
|
|
|
|
|
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18209
|
|
|
|
|
|
|
last_multiword_token = to; |
18210
|
0
|
0
|
|
|
|
|
multiword_tokens.emplace_back(from, line); |
18211
|
0
|
|
|
|
|
|
continue; |
18212
|
|
|
|
|
|
|
} |
18213
|
|
|
|
|
|
|
|
18214
|
|
|
|
|
|
|
// Parse node ID and head |
18215
|
|
|
|
|
|
|
int id; |
18216
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
|
0
|
|
|
|
|
|
18217
|
|
|
|
|
|
|
return false; |
18218
|
0
|
0
|
|
|
|
|
if (id != int(t.nodes.size())) |
18219
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18220
|
|
|
|
|
|
|
|
18221
|
|
|
|
|
|
|
int head; |
18222
|
0
|
0
|
|
|
|
|
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18223
|
0
|
|
|
|
|
|
head = -1; |
18224
|
|
|
|
|
|
|
} else { |
18225
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
|
0
|
|
|
|
|
|
18226
|
|
|
|
|
|
|
return false; |
18227
|
0
|
0
|
|
|
|
|
if (head < 0) |
18228
|
0
|
0
|
|
|
|
|
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18229
|
|
|
|
|
|
|
} |
18230
|
|
|
|
|
|
|
|
18231
|
|
|
|
|
|
|
// Add new node |
18232
|
0
|
|
|
|
|
|
auto& node = t.add_node(string(tokens[1].str, tokens[1].len)); |
18233
|
0
|
0
|
|
|
|
|
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18234
|
0
|
0
|
|
|
|
|
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18235
|
0
|
0
|
|
|
|
|
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18236
|
0
|
0
|
|
|
|
|
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18237
|
0
|
|
|
|
|
|
node.head = head; |
18238
|
0
|
0
|
|
|
|
|
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18239
|
0
|
0
|
|
|
|
|
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18240
|
0
|
0
|
|
|
|
|
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18241
|
|
|
|
|
|
|
} |
18242
|
|
|
|
|
|
|
|
18243
|
|
|
|
|
|
|
// Check that we got word for the last multiword token |
18244
|
0
|
0
|
|
|
|
|
if (last_multiword_token >= int(t.nodes.size())) |
18245
|
0
|
0
|
|
|
|
|
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18246
|
|
|
|
|
|
|
|
18247
|
|
|
|
|
|
|
// Set heads correctly |
18248
|
0
|
0
|
|
|
|
|
for (auto&& node : t.nodes) |
18249
|
0
|
0
|
|
|
|
|
if (node.id && node.head >= 0) { |
|
|
0
|
|
|
|
|
|
18250
|
0
|
0
|
|
|
|
|
if (node.head >= int(t.nodes.size())) |
18251
|
0
|
0
|
|
|
|
|
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18252
|
0
|
0
|
|
|
|
|
t.set_head(node.id, node.head, node.deprel); |
18253
|
|
|
|
|
|
|
} |
18254
|
|
|
|
|
|
|
|
18255
|
0
|
|
|
|
|
|
return !t.empty(); |
18256
|
|
|
|
|
|
|
} |
18257
|
|
|
|
|
|
|
|
18258
|
|
|
|
|
|
|
// Output CoNLL-U format |
18259
|
|
|
|
|
|
|
|
18260
|
2
|
|
|
|
|
|
const string tree_output_format_conllu::underscore = "_"; |
18261
|
|
|
|
|
|
|
|
18262
|
0
|
|
|
|
|
|
void tree_output_format_conllu::write_tree(const tree& t, string& output, const tree_input_format* additional_info) const { |
18263
|
|
|
|
|
|
|
output.clear(); |
18264
|
|
|
|
|
|
|
|
18265
|
|
|
|
|
|
|
// Try casting input format to CoNLL-U |
18266
|
0
|
0
|
|
|
|
|
auto input_conllu = dynamic_cast(additional_info); |
18267
|
|
|
|
|
|
|
size_t input_conllu_multiword_tokens = 0; |
18268
|
|
|
|
|
|
|
|
18269
|
|
|
|
|
|
|
// Comments if present |
18270
|
0
|
0
|
|
|
|
|
if (input_conllu) |
18271
|
0
|
0
|
|
|
|
|
for (auto&& comment : input_conllu->comments) |
18272
|
0
|
|
|
|
|
|
output.append(comment.str, comment.len).push_back('\n'); |
18273
|
|
|
|
|
|
|
|
18274
|
|
|
|
|
|
|
// Print out the tokens |
18275
|
0
|
0
|
|
|
|
|
for (int i = 1 /*skip the root node*/; i < int(t.nodes.size()); i++) { |
18276
|
|
|
|
|
|
|
// Write multiword token if present |
18277
|
0
|
0
|
|
|
|
|
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18278
|
0
|
|
|
|
|
|
i == input_conllu->multiword_tokens[input_conllu_multiword_tokens].first) { |
18279
|
0
|
|
|
|
|
|
output.append(input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.str, |
18280
|
0
|
|
|
|
|
|
input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.len).push_back('\n'); |
18281
|
0
|
|
|
|
|
|
input_conllu_multiword_tokens++; |
18282
|
|
|
|
|
|
|
} |
18283
|
|
|
|
|
|
|
|
18284
|
|
|
|
|
|
|
// Write the token |
18285
|
0
|
0
|
|
|
|
|
output.append(to_string(i)).push_back('\t'); |
18286
|
0
|
|
|
|
|
|
output.append(t.nodes[i].form).push_back('\t'); |
18287
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].lemma)).push_back('\t'); |
18288
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].upostag)).push_back('\t'); |
18289
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].xpostag)).push_back('\t'); |
18290
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].feats)).push_back('\t'); |
18291
|
0
|
0
|
|
|
|
|
output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t'); |
|
|
0
|
|
|
|
|
|
18292
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].deprel)).push_back('\t'); |
18293
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].deps)).push_back('\t'); |
18294
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].misc)).push_back('\n'); |
18295
|
|
|
|
|
|
|
} |
18296
|
0
|
|
|
|
|
|
output.push_back('\n'); |
18297
|
0
|
|
|
|
|
|
} |
18298
|
|
|
|
|
|
|
|
18299
|
|
|
|
|
|
|
} // namespace parsito |
18300
|
|
|
|
|
|
|
|
18301
|
|
|
|
|
|
|
///////// |
18302
|
|
|
|
|
|
|
// File: parsito/version/version.h |
18303
|
|
|
|
|
|
|
///////// |
18304
|
|
|
|
|
|
|
|
18305
|
|
|
|
|
|
|
// This file is part of Parsito . |
18306
|
|
|
|
|
|
|
// |
18307
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18308
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18309
|
|
|
|
|
|
|
// |
18310
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18311
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18312
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18313
|
|
|
|
|
|
|
|
18314
|
|
|
|
|
|
|
namespace parsito { |
18315
|
|
|
|
|
|
|
|
18316
|
0
|
|
|
|
|
|
struct version { |
18317
|
|
|
|
|
|
|
unsigned major; |
18318
|
|
|
|
|
|
|
unsigned minor; |
18319
|
|
|
|
|
|
|
unsigned patch; |
18320
|
|
|
|
|
|
|
std::string prerelease; |
18321
|
|
|
|
|
|
|
|
18322
|
|
|
|
|
|
|
// Returns current version. |
18323
|
|
|
|
|
|
|
static version current(); |
18324
|
|
|
|
|
|
|
|
18325
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
18326
|
|
|
|
|
|
|
static string version_and_copyright(const string& other_libraries = string()); |
18327
|
|
|
|
|
|
|
}; |
18328
|
|
|
|
|
|
|
|
18329
|
|
|
|
|
|
|
} // namespace parsito |
18330
|
|
|
|
|
|
|
|
18331
|
|
|
|
|
|
|
///////// |
18332
|
|
|
|
|
|
|
// File: parsito/version/version.cpp |
18333
|
|
|
|
|
|
|
///////// |
18334
|
|
|
|
|
|
|
|
18335
|
|
|
|
|
|
|
// This file is part of Parsito . |
18336
|
|
|
|
|
|
|
// |
18337
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18338
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18339
|
|
|
|
|
|
|
// |
18340
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18341
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18342
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18343
|
|
|
|
|
|
|
|
18344
|
|
|
|
|
|
|
namespace parsito { |
18345
|
|
|
|
|
|
|
|
18346
|
|
|
|
|
|
|
// Returns current version. |
18347
|
0
|
|
|
|
|
|
version version::current() { |
18348
|
0
|
0
|
|
|
|
|
return {1, 1, 1, "devel"}; |
|
|
0
|
|
|
|
|
|
18349
|
|
|
|
|
|
|
} |
18350
|
|
|
|
|
|
|
|
18351
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
18352
|
0
|
|
|
|
|
|
string version::version_and_copyright(const string& other_libraries) { |
18353
|
0
|
|
|
|
|
|
ostringstream info; |
18354
|
|
|
|
|
|
|
|
18355
|
|
|
|
|
|
|
auto parsito = version::current(); |
18356
|
|
|
|
|
|
|
auto unilib = unilib::version::current(); |
18357
|
|
|
|
|
|
|
|
18358
|
0
|
|
|
|
|
|
info << "Parsito version " << parsito.major << '.' << parsito.minor << '.' << parsito.patch |
18359
|
0
|
0
|
|
|
|
|
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
|
0
|
|
|
|
|
|
18360
|
0
|
|
|
|
|
|
<< " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch |
18361
|
0
|
0
|
|
|
|
|
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
0
|
|
|
|
|
|
18362
|
|
|
|
|
|
|
"Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n" |
18363
|
0
|
0
|
|
|
|
|
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
18364
|
|
|
|
|
|
|
|
18365
|
0
|
|
|
|
|
|
return info.str(); |
18366
|
|
|
|
|
|
|
} |
18367
|
|
|
|
|
|
|
|
18368
|
|
|
|
|
|
|
} // namespace parsito |
18369
|
|
|
|
|
|
|
|
18370
|
|
|
|
|
|
|
///////// |
18371
|
|
|
|
|
|
|
// File: sentence/input_format.cpp |
18372
|
|
|
|
|
|
|
///////// |
18373
|
|
|
|
|
|
|
|
18374
|
|
|
|
|
|
|
// This file is part of UDPipe . |
18375
|
|
|
|
|
|
|
// |
18376
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18377
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18378
|
|
|
|
|
|
|
// |
18379
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18380
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18381
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18382
|
|
|
|
|
|
|
|
18383
|
2
|
|
|
|
|
|
const string input_format::CONLLU_V1 = "v1"; |
18384
|
2
|
|
|
|
|
|
const string input_format::CONLLU_V2 = "v2"; |
18385
|
2
|
|
|
|
|
|
const string input_format::GENERIC_TOKENIZER_NORMALIZED_SPACES = "normalized_spaces"; |
18386
|
2
|
|
|
|
|
|
const string input_format::GENERIC_TOKENIZER_PRESEGMENTED = "presegmented"; |
18387
|
2
|
|
|
|
|
|
const string input_format::GENERIC_TOKENIZER_RANGES = "ranges"; |
18388
|
|
|
|
|
|
|
|
18389
|
|
|
|
|
|
|
// CoNLL-U input format |
18390
|
0
|
|
|
|
|
|
class input_format_conllu : public input_format { |
18391
|
|
|
|
|
|
|
public: |
18392
|
0
|
|
|
|
|
|
input_format_conllu(unsigned version) : version(version) {} |
18393
|
|
|
|
|
|
|
|
18394
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
18395
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) override; |
18396
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
18397
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
18398
|
|
|
|
|
|
|
|
18399
|
|
|
|
|
|
|
private: |
18400
|
|
|
|
|
|
|
unsigned version; |
18401
|
|
|
|
|
|
|
string_piece text; |
18402
|
|
|
|
|
|
|
string text_copy; |
18403
|
|
|
|
|
|
|
|
18404
|
|
|
|
|
|
|
static const string columns[10]; |
18405
|
|
|
|
|
|
|
}; |
18406
|
|
|
|
|
|
|
|
18407
|
26
|
100
|
|
|
|
|
const string input_format_conllu::columns[10] = {"ID", "FORM", "LEMMA", |
18408
|
2
|
50
|
|
|
|
|
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18409
|
|
|
|
|
|
|
|
18410
|
0
|
|
|
|
|
|
bool input_format_conllu::read_block(istream& is, string& block) const { |
18411
|
0
|
|
|
|
|
|
return bool(getpara(is, block)); |
18412
|
|
|
|
|
|
|
} |
18413
|
|
|
|
|
|
|
|
18414
|
0
|
|
|
|
|
|
void input_format_conllu::reset_document(string_piece /*id*/) { |
18415
|
0
|
|
|
|
|
|
set_text(""); |
18416
|
0
|
|
|
|
|
|
} |
18417
|
|
|
|
|
|
|
|
18418
|
0
|
|
|
|
|
|
void input_format_conllu::set_text(string_piece text, bool make_copy) { |
18419
|
0
|
0
|
|
|
|
|
if (make_copy) { |
18420
|
0
|
|
|
|
|
|
text_copy.assign(text.str, text.len); |
18421
|
|
|
|
|
|
|
text = string_piece(text_copy.c_str(), text_copy.size()); |
18422
|
|
|
|
|
|
|
} |
18423
|
0
|
|
|
|
|
|
this->text = text; |
18424
|
0
|
|
|
|
|
|
} |
18425
|
|
|
|
|
|
|
|
18426
|
0
|
|
|
|
|
|
bool input_format_conllu::next_sentence(sentence& s, string& error) { |
18427
|
|
|
|
|
|
|
error.clear(); |
18428
|
0
|
|
|
|
|
|
s.clear(); |
18429
|
|
|
|
|
|
|
int last_multiword_token = 0; |
18430
|
|
|
|
|
|
|
|
18431
|
|
|
|
|
|
|
vector tokens, parts; |
18432
|
0
|
0
|
|
|
|
|
while (text.len) { |
18433
|
|
|
|
|
|
|
// Read line |
18434
|
0
|
|
|
|
|
|
string_piece line(text.str, 0); |
18435
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++; |
|
|
0
|
|
|
|
|
|
18436
|
|
|
|
|
|
|
|
18437
|
0
|
|
|
|
|
|
text.str += line.len, text.len -= line.len; |
18438
|
0
|
0
|
|
|
|
|
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18439
|
0
|
|
|
|
|
|
text.str += 2, text.len -= 2; |
18440
|
0
|
0
|
|
|
|
|
else if (text.len && *text.str == '\n') |
|
|
0
|
|
|
|
|
|
18441
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18442
|
|
|
|
|
|
|
|
18443
|
|
|
|
|
|
|
// Empty lines denote end of tree, unless at the beginning |
18444
|
0
|
0
|
|
|
|
|
if (!line.len) { |
18445
|
0
|
0
|
|
|
|
|
if (s.empty()) continue; |
18446
|
0
|
|
|
|
|
|
break; |
18447
|
|
|
|
|
|
|
} |
18448
|
|
|
|
|
|
|
|
18449
|
0
|
0
|
|
|
|
|
if (*line.str == '#') { |
18450
|
|
|
|
|
|
|
// Store comments at the beginning and ignore the rest |
18451
|
0
|
0
|
|
|
|
|
if (s.empty()) s.comments.emplace_back(line.str, line.len); |
|
|
0
|
|
|
|
|
|
18452
|
|
|
|
|
|
|
continue; |
18453
|
|
|
|
|
|
|
} |
18454
|
|
|
|
|
|
|
|
18455
|
|
|
|
|
|
|
// Parse the line |
18456
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
18457
|
0
|
0
|
|
|
|
|
if (tokens.size() != 10) |
18458
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18459
|
|
|
|
|
|
|
|
18460
|
|
|
|
|
|
|
// Check that no column is empty and contains no spaces (except FORM, LEMMA and MISC in version >= 2) |
18461
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 10; i++) { |
18462
|
0
|
0
|
|
|
|
|
if (!tokens[i].len) |
18463
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18464
|
0
|
0
|
|
|
|
|
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18465
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18466
|
|
|
|
|
|
|
} |
18467
|
|
|
|
|
|
|
|
18468
|
|
|
|
|
|
|
// Handle multiword tokens |
18469
|
0
|
0
|
|
|
|
|
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
18470
|
0
|
0
|
|
|
|
|
split(tokens[0], '-', parts); |
18471
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) |
18472
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18473
|
|
|
|
|
|
|
int from, to; |
18474
|
0
|
0
|
|
|
|
|
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18475
|
|
|
|
|
|
|
return false; |
18476
|
0
|
0
|
|
|
|
|
if (from != int(s.words.size())) |
18477
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18478
|
0
|
0
|
|
|
|
|
if (to < from) |
18479
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18480
|
0
|
0
|
|
|
|
|
if (from <= last_multiword_token) |
18481
|
0
|
0
|
|
|
|
|
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18482
|
|
|
|
|
|
|
last_multiword_token = to; |
18483
|
0
|
0
|
|
|
|
|
for (int i = 2; i < 9; i++) |
18484
|
0
|
0
|
|
|
|
|
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18485
|
0
|
0
|
|
|
|
|
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18486
|
0
|
0
|
|
|
|
|
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18487
|
0
|
|
|
|
|
|
continue; |
18488
|
|
|
|
|
|
|
} |
18489
|
|
|
|
|
|
|
|
18490
|
|
|
|
|
|
|
// Handle empty nodes |
18491
|
0
|
0
|
|
|
|
|
if (version >= 2) |
18492
|
0
|
0
|
|
|
|
|
if (memchr(tokens[0].str, '.', tokens[0].len)) { |
18493
|
0
|
0
|
|
|
|
|
split(tokens[0], '.', parts); |
18494
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) |
18495
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18496
|
|
|
|
|
|
|
int id, index; |
18497
|
0
|
0
|
|
|
|
|
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18498
|
|
|
|
|
|
|
return false; |
18499
|
0
|
0
|
|
|
|
|
if (id != int(s.words.size()) - 1) |
18500
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18501
|
0
|
0
|
|
|
|
|
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18502
|
0
|
0
|
|
|
|
|
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18503
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18504
|
0
|
0
|
|
|
|
|
for (int i = 6; i < 8; i++) |
18505
|
0
|
0
|
|
|
|
|
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18506
|
0
|
0
|
|
|
|
|
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18507
|
|
|
|
|
|
|
|
18508
|
0
|
0
|
|
|
|
|
s.empty_nodes.emplace_back(id, index); |
18509
|
0
|
|
|
|
|
|
s.empty_nodes.back().form.assign(tokens[1].str, tokens[1].len); |
18510
|
0
|
|
|
|
|
|
s.empty_nodes.back().lemma.assign(tokens[2].str, tokens[2].len); |
18511
|
0
|
0
|
|
|
|
|
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18512
|
0
|
0
|
|
|
|
|
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18513
|
0
|
0
|
|
|
|
|
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18514
|
0
|
0
|
|
|
|
|
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18515
|
0
|
0
|
|
|
|
|
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18516
|
0
|
|
|
|
|
|
continue; |
18517
|
|
|
|
|
|
|
} |
18518
|
|
|
|
|
|
|
|
18519
|
|
|
|
|
|
|
// Parse word ID and head |
18520
|
|
|
|
|
|
|
int id; |
18521
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
|
0
|
|
|
|
|
|
18522
|
|
|
|
|
|
|
return false; |
18523
|
0
|
0
|
|
|
|
|
if (id != int(s.words.size())) |
18524
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18525
|
|
|
|
|
|
|
|
18526
|
|
|
|
|
|
|
int head; |
18527
|
0
|
0
|
|
|
|
|
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18528
|
0
|
|
|
|
|
|
head = -1; |
18529
|
|
|
|
|
|
|
} else { |
18530
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
|
0
|
|
|
|
|
|
18531
|
|
|
|
|
|
|
return false; |
18532
|
0
|
0
|
|
|
|
|
if (head < 0) |
18533
|
0
|
0
|
|
|
|
|
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18534
|
|
|
|
|
|
|
} |
18535
|
|
|
|
|
|
|
|
18536
|
|
|
|
|
|
|
// Add new word |
18537
|
|
|
|
|
|
|
auto& word = s.add_word(tokens[1]); |
18538
|
0
|
|
|
|
|
|
word.lemma.assign(tokens[2].str, tokens[2].len); |
18539
|
0
|
0
|
|
|
|
|
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18540
|
0
|
0
|
|
|
|
|
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18541
|
0
|
0
|
|
|
|
|
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18542
|
0
|
|
|
|
|
|
word.head = head; |
18543
|
0
|
0
|
|
|
|
|
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18544
|
0
|
0
|
|
|
|
|
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18545
|
0
|
0
|
|
|
|
|
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18546
|
|
|
|
|
|
|
} |
18547
|
|
|
|
|
|
|
|
18548
|
|
|
|
|
|
|
// Check that we got word for the last multiword token |
18549
|
0
|
0
|
|
|
|
|
if (last_multiword_token >= int(s.words.size())) |
18550
|
0
|
0
|
|
|
|
|
return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false; |
|
|
0
|
|
|
|
|
|
18551
|
|
|
|
|
|
|
|
18552
|
|
|
|
|
|
|
// Set heads correctly |
18553
|
0
|
0
|
|
|
|
|
for (auto&& word : s.words) |
18554
|
0
|
0
|
|
|
|
|
if (word.id && word.head >= 0) { |
|
|
0
|
|
|
|
|
|
18555
|
0
|
0
|
|
|
|
|
if (word.head >= int(s.words.size())) |
18556
|
0
|
0
|
|
|
|
|
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18557
|
0
|
0
|
|
|
|
|
s.set_head(word.id, word.head, word.deprel); |
18558
|
|
|
|
|
|
|
} |
18559
|
|
|
|
|
|
|
|
18560
|
0
|
|
|
|
|
|
return !s.empty(); |
18561
|
|
|
|
|
|
|
} |
18562
|
|
|
|
|
|
|
|
18563
|
|
|
|
|
|
|
// Horizontal input format |
18564
|
0
|
|
|
|
|
|
class input_format_horizontal : public input_format { |
18565
|
|
|
|
|
|
|
public: |
18566
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
18567
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) override; |
18568
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
18569
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
18570
|
|
|
|
|
|
|
|
18571
|
|
|
|
|
|
|
private: |
18572
|
|
|
|
|
|
|
string_piece text; |
18573
|
|
|
|
|
|
|
string text_copy; |
18574
|
|
|
|
|
|
|
bool new_document = true; |
18575
|
|
|
|
|
|
|
string document_id; |
18576
|
|
|
|
|
|
|
unsigned preceeding_newlines = 2; |
18577
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
18578
|
|
|
|
|
|
|
}; |
18579
|
|
|
|
|
|
|
|
18580
|
0
|
|
|
|
|
|
// Reads a single line from `is' into `block' and re-appends the '\n'
// stripped by getline. Returns false when no line could be read.
bool input_format_horizontal::read_block(istream& is, string& block) const {
  if (!getline(is, block))
    return false;

  block.push_back('\n');
  return true;
}
18585
|
|
|
|
|
|
|
|
18586
|
0
|
|
|
|
|
|
// Starts a new document with the given id: the next sentence is flagged as
// opening a new document and a new paragraph, sentence ids restart from 1
// and any pending text is dropped.
void input_format_horizontal::reset_document(string_piece id) {
  document_id.assign(id.str, id.len);
  new_document = true;

  // Two newlines is the threshold next_sentence uses to mark a new paragraph.
  preceeding_newlines = 2;
  sentence_id = 1;

  set_text("");
}
18593
|
|
|
|
|
|
|
|
18594
|
0
|
|
|
|
|
|
// Sets the text processed by subsequent next_sentence calls. Without
// `make_copy' only the view is stored; with `make_copy' the characters are
// copied into `text_copy' and the view points at that copy.
void input_format_horizontal::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  this->text = string_piece(text_copy.c_str(), text_copy.size());
}
18601
|
|
|
|
|
|
|
|
18602
|
0
|
|
|
|
|
|
bool input_format_horizontal::next_sentence(sentence& s, string& error) { |
18603
|
|
|
|
|
|
|
error.clear(); |
18604
|
0
|
|
|
|
|
|
s.clear(); |
18605
|
|
|
|
|
|
|
|
18606
|
|
|
|
|
|
|
// Skip spaces and newlines |
18607
|
0
|
0
|
|
|
|
|
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18608
|
0
|
|
|
|
|
|
preceeding_newlines += *text.str == '\n'; |
18609
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18610
|
|
|
|
|
|
|
} |
18611
|
|
|
|
|
|
|
|
18612
|
|
|
|
|
|
|
// Read space (and tab) separated words |
18613
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18614
|
|
|
|
|
|
|
string_piece word = text; |
18615
|
|
|
|
|
|
|
|
18616
|
|
|
|
|
|
|
// Slurp the word |
18617
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18618
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18619
|
0
|
|
|
|
|
|
word.len = text.str - word.str; |
18620
|
|
|
|
|
|
|
s.add_word(word); |
18621
|
|
|
|
|
|
|
|
18622
|
|
|
|
|
|
|
// Replace s by regular spaces |
18623
|
0
|
0
|
|
|
|
|
if (s.words.back().form.find("\302\240") != string::npos) { |
18624
|
0
|
|
|
|
|
|
string& form = s.words.back().form; |
18625
|
|
|
|
|
|
|
size_t form_len = 0; |
18626
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < form.size(); i++) { |
18627
|
0
|
0
|
|
|
|
|
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18628
|
0
|
|
|
|
|
|
form[form_len - 1] = ' '; |
18629
|
|
|
|
|
|
|
else |
18630
|
0
|
|
|
|
|
|
form[form_len++] = form[i]; |
18631
|
|
|
|
|
|
|
} |
18632
|
|
|
|
|
|
|
form.resize(form_len); |
18633
|
|
|
|
|
|
|
} |
18634
|
|
|
|
|
|
|
|
18635
|
|
|
|
|
|
|
// Skip spaces |
18636
|
0
|
0
|
|
|
|
|
while (text.len && (*text.str == ' ' || *text.str == '\t')) |
|
|
0
|
|
|
|
|
|
18637
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18638
|
|
|
|
|
|
|
} |
18639
|
|
|
|
|
|
|
|
18640
|
0
|
0
|
|
|
|
|
if (!s.empty()) { |
18641
|
|
|
|
|
|
|
// Mark new document if needed |
18642
|
0
|
0
|
|
|
|
|
if (new_document) |
18643
|
0
|
|
|
|
|
|
s.set_new_doc(true, document_id); |
18644
|
0
|
|
|
|
|
|
new_document = false; |
18645
|
|
|
|
|
|
|
|
18646
|
|
|
|
|
|
|
// Mark new paragraph if needed |
18647
|
0
|
0
|
|
|
|
|
if (preceeding_newlines >= 2) |
18648
|
0
|
|
|
|
|
|
s.set_new_par(true); |
18649
|
0
|
|
|
|
|
|
preceeding_newlines = 0; |
18650
|
|
|
|
|
|
|
|
18651
|
|
|
|
|
|
|
// Sentence id |
18652
|
0
|
0
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
18653
|
|
|
|
|
|
|
} |
18654
|
|
|
|
|
|
|
|
18655
|
0
|
|
|
|
|
|
return !s.empty(); |
18656
|
|
|
|
|
|
|
} |
18657
|
|
|
|
|
|
|
|
18658
|
|
|
|
|
|
|
// Vertical input format |
18659
|
0
|
|
|
|
|
|
class input_format_vertical : public input_format { |
18660
|
|
|
|
|
|
|
public: |
18661
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
18662
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) override; |
18663
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
18664
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
18665
|
|
|
|
|
|
|
|
18666
|
|
|
|
|
|
|
private: |
18667
|
|
|
|
|
|
|
string_piece text; |
18668
|
|
|
|
|
|
|
string text_copy; |
18669
|
|
|
|
|
|
|
bool new_document = true; |
18670
|
|
|
|
|
|
|
string document_id; |
18671
|
|
|
|
|
|
|
unsigned preceeding_newlines = 2; |
18672
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
18673
|
|
|
|
|
|
|
}; |
18674
|
|
|
|
|
|
|
|
18675
|
0
|
|
|
|
|
|
// Reads one block from `is' into `block' via getpara and reports whether
// the read succeeded.
bool input_format_vertical::read_block(istream& is, string& block) const {
  return static_cast<bool>(getpara(is, block));
}
18678
|
|
|
|
|
|
|
|
18679
|
0
|
|
|
|
|
|
// Starts a new document with the given id: the next sentence is flagged as
// opening a new document and a new paragraph, sentence ids restart from 1
// and any pending text is dropped.
void input_format_vertical::reset_document(string_piece id) {
  document_id.assign(id.str, id.len);
  new_document = true;

  // Two newlines is the threshold next_sentence uses to mark a new paragraph.
  preceeding_newlines = 2;
  sentence_id = 1;

  set_text("");
}
18686
|
|
|
|
|
|
|
|
18687
|
0
|
|
|
|
|
|
// Sets the text processed by subsequent next_sentence calls. Without
// `make_copy' only the view is stored; with `make_copy' the characters are
// copied into `text_copy' and the view points at that copy.
void input_format_vertical::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  this->text = string_piece(text_copy.c_str(), text_copy.size());
}
18694
|
|
|
|
|
|
|
|
18695
|
0
|
|
|
|
|
|
bool input_format_vertical::next_sentence(sentence& s, string& error) { |
18696
|
|
|
|
|
|
|
error.clear(); |
18697
|
0
|
|
|
|
|
|
s.clear(); |
18698
|
|
|
|
|
|
|
|
18699
|
|
|
|
|
|
|
// Skip tabs and newlines |
18700
|
0
|
0
|
|
|
|
|
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18701
|
0
|
|
|
|
|
|
preceeding_newlines += *text.str == '\n'; |
18702
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18703
|
|
|
|
|
|
|
} |
18704
|
|
|
|
|
|
|
|
18705
|
|
|
|
|
|
|
// Read first word without tabs on every line |
18706
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18707
|
|
|
|
|
|
|
string_piece word = text; |
18708
|
|
|
|
|
|
|
|
18709
|
|
|
|
|
|
|
// Slurp the word |
18710
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18711
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18712
|
0
|
|
|
|
|
|
word.len = text.str - word.str; |
18713
|
|
|
|
|
|
|
s.add_word(word); |
18714
|
|
|
|
|
|
|
|
18715
|
|
|
|
|
|
|
// Skip spaces till end of line |
18716
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\r' && *text.str != '\n') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18717
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18718
|
|
|
|
|
|
|
|
18719
|
|
|
|
|
|
|
// Skip one new line |
18720
|
0
|
0
|
|
|
|
|
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18721
|
0
|
|
|
|
|
|
text.str += 2, text.len -= 2; |
18722
|
0
|
0
|
|
|
|
|
else if (text.len && *text.str == '\n') |
|
|
0
|
|
|
|
|
|
18723
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18724
|
|
|
|
|
|
|
|
18725
|
|
|
|
|
|
|
// Skip tabs on the beginning of the line |
18726
|
0
|
0
|
|
|
|
|
while (text.len && *text.str == '\t') |
|
|
0
|
|
|
|
|
|
18727
|
0
|
|
|
|
|
|
text.str++, text.len--; |
18728
|
|
|
|
|
|
|
} |
18729
|
|
|
|
|
|
|
|
18730
|
0
|
0
|
|
|
|
|
if (!s.empty()) { |
18731
|
|
|
|
|
|
|
// Mark new document if needed |
18732
|
0
|
0
|
|
|
|
|
if (new_document) |
18733
|
0
|
|
|
|
|
|
s.set_new_doc(true, document_id); |
18734
|
0
|
|
|
|
|
|
new_document = false; |
18735
|
|
|
|
|
|
|
|
18736
|
|
|
|
|
|
|
// Mark new paragraph if needed |
18737
|
0
|
0
|
|
|
|
|
if (preceeding_newlines >= 2) |
18738
|
0
|
|
|
|
|
|
s.set_new_par(true); |
18739
|
0
|
|
|
|
|
|
preceeding_newlines = 0; |
18740
|
|
|
|
|
|
|
|
18741
|
|
|
|
|
|
|
// Sentence id |
18742
|
0
|
0
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
18743
|
|
|
|
|
|
|
} |
18744
|
|
|
|
|
|
|
|
18745
|
0
|
|
|
|
|
|
return !s.empty(); |
18746
|
|
|
|
|
|
|
} |
18747
|
|
|
|
|
|
|
|
18748
|
|
|
|
|
|
|
// Presegmented tokenizer |
18749
|
0
|
|
|
|
|
|
class input_format_presegmented_tokenizer : public input_format { |
18750
|
|
|
|
|
|
|
public: |
18751
|
0
|
|
|
|
|
|
input_format_presegmented_tokenizer(input_format* tokenizer) : tokenizer(tokenizer) {} |
18752
|
|
|
|
|
|
|
|
18753
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
18754
|
|
|
|
|
|
|
virtual void reset_document(string_piece id) override; |
18755
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
18756
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
18757
|
|
|
|
|
|
|
|
18758
|
|
|
|
|
|
|
private: |
18759
|
|
|
|
|
|
|
unique_ptr tokenizer; |
18760
|
|
|
|
|
|
|
string_piece text; |
18761
|
|
|
|
|
|
|
string text_copy; |
18762
|
|
|
|
|
|
|
bool new_document = true; |
18763
|
|
|
|
|
|
|
string document_id; |
18764
|
|
|
|
|
|
|
unsigned preceeding_newlines = 2; |
18765
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
18766
|
|
|
|
|
|
|
}; |
18767
|
|
|
|
|
|
|
|
18768
|
0
|
|
|
|
|
|
// Reads a single line from `is' into `block' and re-appends the '\n'
// stripped by getline. Returns false when no line could be read.
bool input_format_presegmented_tokenizer::read_block(istream& is, string& block) const {
  if (!getline(is, block))
    return false;

  block.push_back('\n');
  return true;
}
18773
|
|
|
|
|
|
|
|
18774
|
0
|
|
|
|
|
|
// Starts a new document with the given id: the next sentence is flagged as
// opening a new document and a new paragraph, sentence ids restart from 1,
// the wrapped tokenizer is reset as well and any pending text is dropped.
void input_format_presegmented_tokenizer::reset_document(string_piece id) {
  document_id.assign(id.str, id.len);
  new_document = true;

  // Two newlines is the threshold next_sentence uses to mark a new paragraph.
  preceeding_newlines = 2;
  sentence_id = 1;

  // The wrapped tokenizer is reset without an id; the id is kept only here.
  tokenizer->reset_document();
  set_text("");
}
18782
|
|
|
|
|
|
|
|
18783
|
0
|
|
|
|
|
|
// Sets the text processed by subsequent next_sentence calls. Without
// `make_copy' only the view is stored; with `make_copy' the characters are
// copied into `text_copy' and the view points at that copy.
void input_format_presegmented_tokenizer::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  this->text = string_piece(text_copy.c_str(), text_copy.size());
}
18790
|
|
|
|
|
|
|
|
18791
|
0
|
|
|
|
|
|
bool input_format_presegmented_tokenizer::next_sentence(sentence& s, string& error) { |
18792
|
|
|
|
|
|
|
error.clear(); |
18793
|
0
|
|
|
|
|
|
s.clear(); |
18794
|
|
|
|
|
|
|
|
18795
|
0
|
|
|
|
|
|
sentence partial; |
18796
|
|
|
|
|
|
|
unsigned following_newlines = 0; |
18797
|
0
|
0
|
|
|
|
|
while (text.len && s.empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18798
|
|
|
|
|
|
|
// Move next line from `text' to `line', including leading and following newlines |
18799
|
0
|
|
|
|
|
|
string_piece line(text.str, 0); |
18800
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
|
0
|
|
|
|
|
|
18801
|
0
|
|
|
|
|
|
preceeding_newlines += line.str[line.len] == '\n'; |
18802
|
0
|
|
|
|
|
|
line.len++; |
18803
|
|
|
|
|
|
|
} |
18804
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r')) |
|
|
0
|
|
|
|
|
|
18805
|
0
|
|
|
|
|
|
line.len++; |
18806
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
|
0
|
|
|
|
|
|
18807
|
0
|
|
|
|
|
|
following_newlines += line.str[line.len] == '\n'; |
18808
|
0
|
|
|
|
|
|
line.len++; |
18809
|
|
|
|
|
|
|
} |
18810
|
0
|
|
|
|
|
|
text.str += line.len, text.len -= line.len; |
18811
|
|
|
|
|
|
|
|
18812
|
|
|
|
|
|
|
// Add all tokens from the line to `s' |
18813
|
0
|
0
|
|
|
|
|
tokenizer->set_text(line, false); |
18814
|
0
|
0
|
|
|
|
|
while (tokenizer->next_sentence(partial, error)) { |
|
|
0
|
|
|
|
|
|
18815
|
|
|
|
|
|
|
// Append words |
18816
|
0
|
|
|
|
|
|
size_t words = s.words.size() - 1; |
18817
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < partial.words.size(); i++) { |
18818
|
0
|
|
|
|
|
|
s.words.push_back(move(partial.words[i])); |
18819
|
0
|
|
|
|
|
|
s.words.back().id += words; |
18820
|
0
|
0
|
|
|
|
|
if (s.words.back().head > 0) s.words.back().head += words; |
18821
|
|
|
|
|
|
|
} |
18822
|
|
|
|
|
|
|
|
18823
|
|
|
|
|
|
|
// Append multiword_tokens |
18824
|
0
|
0
|
|
|
|
|
for (auto&& multiword_token : partial.multiword_tokens) { |
18825
|
0
|
|
|
|
|
|
s.multiword_tokens.push_back(move(multiword_token)); |
18826
|
0
|
|
|
|
|
|
s.multiword_tokens.back().id_first += words; |
18827
|
0
|
|
|
|
|
|
s.multiword_tokens.back().id_last += words; |
18828
|
|
|
|
|
|
|
} |
18829
|
|
|
|
|
|
|
|
18830
|
|
|
|
|
|
|
// Append empty nodes |
18831
|
0
|
0
|
|
|
|
|
for (auto&& empty_node : partial.empty_nodes) { |
18832
|
0
|
|
|
|
|
|
s.empty_nodes.push_back(move(empty_node)); |
18833
|
0
|
|
|
|
|
|
s.empty_nodes.back().id += words; |
18834
|
|
|
|
|
|
|
} |
18835
|
|
|
|
|
|
|
} |
18836
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
18837
|
|
|
|
|
|
|
|
18838
|
0
|
0
|
|
|
|
|
if (s.empty()) { |
18839
|
0
|
|
|
|
|
|
preceeding_newlines += following_newlines; |
18840
|
|
|
|
|
|
|
following_newlines = 0; |
18841
|
|
|
|
|
|
|
} |
18842
|
|
|
|
|
|
|
} |
18843
|
|
|
|
|
|
|
|
18844
|
0
|
0
|
|
|
|
|
if (!s.empty()) { |
18845
|
|
|
|
|
|
|
// Mark new document if needed |
18846
|
0
|
0
|
|
|
|
|
if (new_document) |
18847
|
0
|
0
|
|
|
|
|
s.set_new_doc(true, document_id); |
18848
|
0
|
|
|
|
|
|
new_document = false; |
18849
|
|
|
|
|
|
|
|
18850
|
|
|
|
|
|
|
// Mark new paragraph if needed |
18851
|
0
|
0
|
|
|
|
|
if (preceeding_newlines >= 2) |
18852
|
0
|
0
|
|
|
|
|
s.set_new_par(true); |
18853
|
0
|
|
|
|
|
|
preceeding_newlines = following_newlines; |
18854
|
|
|
|
|
|
|
|
18855
|
|
|
|
|
|
|
// Sentence id |
18856
|
0
|
0
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
18857
|
|
|
|
|
|
|
|
18858
|
|
|
|
|
|
|
// Fill "# text" comment |
18859
|
0
|
0
|
|
|
|
|
s.comments.emplace_back("# text = "); |
18860
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
18861
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
|
0
|
|
|
|
|
|
18862
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18863
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
18864
|
|
|
|
|
|
|
|
18865
|
|
|
|
|
|
|
s.comments.back().append(tok.form); |
18866
|
0
|
0
|
|
|
|
|
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
18867
|
|
|
|
|
|
|
} |
18868
|
|
|
|
|
|
|
} |
18869
|
|
|
|
|
|
|
|
18870
|
0
|
|
|
|
|
|
return !s.empty(); |
18871
|
|
|
|
|
|
|
} |
18872
|
|
|
|
|
|
|
|
18873
|
|
|
|
|
|
|
// Static factory methods |
18874
|
0
|
|
|
|
|
|
// Creates a CoNLL-U input format. `options' is parsed by
// named_values::parse; CONLLU_V1 selects version 1, CONLLU_V2 (which wins
// when both are given) and the default select version 2. Returns nullptr
// when the options cannot be parsed.
input_format* input_format::new_conllu_input_format(const string& options) {
  named_values::map option_map;
  string option_error;
  if (!named_values::parse(options, option_map, option_error))
    return nullptr;

  const unsigned version = option_map.count(CONLLU_V2) ? 2
                         : option_map.count(CONLLU_V1) ? 1
                         : 2;

  return new input_format_conllu(version);
}
18888
|
|
|
|
|
|
|
|
18889
|
0
|
|
|
|
|
|
// Creates an input format wrapping the MorphoDiTa generic tokenizer.
// `options' is parsed by named_values::parse and may contain
// GENERIC_TOKENIZER_NORMALIZED_SPACES, GENERIC_TOKENIZER_RANGES and
// GENERIC_TOKENIZER_PRESEGMENTED; with the last one the tokenizer is
// wrapped by new_presegmented_tokenizer. Returns nullptr when the options
// cannot be parsed.
input_format* input_format::new_generic_tokenizer_input_format(const string& options) {
  named_values::map option_map;
  string option_error;
  if (!named_values::parse(options, option_map, option_error))
    return nullptr;

  const bool normalized_spaces = option_map.count(GENERIC_TOKENIZER_NORMALIZED_SPACES) != 0;
  const bool token_ranges = option_map.count(GENERIC_TOKENIZER_RANGES) != 0;

  input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges);

  // Only an existing tokenizer is wrapped, and only when requested
  if (!result || !option_map.count(GENERIC_TOKENIZER_PRESEGMENTED))
    return result;
  return input_format::new_presegmented_tokenizer(result);
}
18901
|
|
|
|
|
|
|
|
18902
|
0
|
|
|
|
|
|
// Creates a horizontal input format; the options string is ignored.
input_format* input_format::new_horizontal_input_format(const string&) {
  input_format* format = new input_format_horizontal();
  return format;
}
18905
|
|
|
|
|
|
|
|
18906
|
0
|
|
|
|
|
|
// Creates a vertical input format; the options string is ignored.
input_format* input_format::new_vertical_input_format(const string&) {
  input_format* format = new input_format_vertical();
  return format;
}
18909
|
|
|
|
|
|
|
|
18910
|
0
|
|
|
|
|
|
// Creates an input format from a specification "name" or "name=options".
// Everything after the first '=' is passed as the options string (empty
// when there is no '='). Recognized names are "conllu",
// "generic_tokenizer", "horizontal" and "vertical"; nullptr is returned
// for any other name.
input_format* input_format::new_input_format(const string& name) {
  const size_t separator = name.find('=');
  const bool has_options = separator != string::npos;

  const string format = has_options ? name.substr(0, separator) : name;
  const string options = has_options ? name.substr(separator + 1) : string();

  if (format == "conllu")
    return new_conllu_input_format(options);
  if (format == "generic_tokenizer")
    return new_generic_tokenizer_input_format(options);
  if (format == "horizontal")
    return new_horizontal_input_format(options);
  if (format == "vertical")
    return new_vertical_input_format(options);

  return nullptr;
}
18921
|
|
|
|
|
|
|
|
18922
|
0
|
|
|
|
|
|
// Wraps `tokenizer' in a presegmented tokenizer; the wrapper stores the
// pointer in a unique_ptr and therefore takes ownership of it.
input_format* input_format::new_presegmented_tokenizer(input_format* tokenizer) {
  input_format* wrapper = new input_format_presegmented_tokenizer(tokenizer);
  return wrapper;
}
18925
|
|
|
|
|
|
|
|
18926
|
|
|
|
|
|
|
///////// |
18927
|
|
|
|
|
|
|
// File: utils/xml_encoded.h |
18928
|
|
|
|
|
|
|
///////// |
18929
|
|
|
|
|
|
|
|
18930
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
18931
|
|
|
|
|
|
|
// |
18932
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18933
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18934
|
|
|
|
|
|
|
// |
18935
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18936
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18937
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18938
|
|
|
|
|
|
|
|
18939
|
|
|
|
|
|
|
namespace utils { |
18940
|
|
|
|
|
|
|
|
18941
|
|
|
|
|
|
|
// |
18942
|
|
|
|
|
|
|
// Declarations |
18943
|
|
|
|
|
|
|
// |
18944
|
|
|
|
|
|
|
|
18945
|
|
|
|
|
|
|
// Print xml content while encoding <>& and optionally " using XML entities. |
18946
|
|
|
|
|
|
|
class xml_encoded { |
18947
|
|
|
|
|
|
|
public: |
18948
|
0
|
|
|
|
|
|
xml_encoded(string_piece str, bool encode_quot = false) : str(str), encode_quot(encode_quot) {} |
18949
|
|
|
|
|
|
|
|
18950
|
|
|
|
|
|
|
friend ostream& operator<<(ostream& os, xml_encoded data); |
18951
|
|
|
|
|
|
|
private: |
18952
|
|
|
|
|
|
|
string_piece str; |
18953
|
|
|
|
|
|
|
bool encode_quot; |
18954
|
|
|
|
|
|
|
}; |
18955
|
|
|
|
|
|
|
|
18956
|
|
|
|
|
|
|
inline ostream& operator<<(ostream& os, xml_encoded data); |
18957
|
|
|
|
|
|
|
|
18958
|
|
|
|
|
|
|
// |
18959
|
|
|
|
|
|
|
// Definitions |
18960
|
|
|
|
|
|
|
// |
18961
|
|
|
|
|
|
|
|
18962
|
0
|
|
|
|
|
|
// Writes `data' to `os', replacing '<', '>' and '&' (and '"' when
// data.encode_quot is set) by the XML entities &lt; &gt; &amp; and &quot;.
// Runs of characters needing no encoding are written with a single
// os.write call. Returns `os'.
ostream& operator<<(ostream& os, xml_encoded data) {
  string_piece& str = data.str;
  const char* to_print = str.str;  // Start of the not yet written plain run

  while (str.len) {
    // Advance over characters which can be printed verbatim
    while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
      str.str++, str.len--;

    if (str.len) {
      // Flush the plain run, then print the entity for the special character
      if (to_print < str.str) os.write(to_print, str.str - to_print);

      const char* entity;
      switch (*str.str) {
        case '<': entity = "&lt;"; break;
        case '>': entity = "&gt;"; break;
        case '&': entity = "&amp;"; break;
        default:  entity = "&quot;"; break;  // Only '"' can get here
      }
      os << entity;

      str.str++, str.len--;
      to_print = str.str;
    }
  }

  // Flush the trailing plain run
  if (to_print < str.str) os.write(to_print, str.str - to_print);

  return os;
}
18982
|
|
|
|
|
|
|
|
18983
|
|
|
|
|
|
|
} // namespace utils |
18984
|
|
|
|
|
|
|
|
18985
|
|
|
|
|
|
|
///////// |
18986
|
|
|
|
|
|
|
// File: sentence/output_format.cpp |
18987
|
|
|
|
|
|
|
///////// |
18988
|
|
|
|
|
|
|
|
18989
|
|
|
|
|
|
|
// This file is part of UDPipe . |
18990
|
|
|
|
|
|
|
// |
18991
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
18992
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
18993
|
|
|
|
|
|
|
// |
18994
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
18995
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
18996
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
18997
|
|
|
|
|
|
|
|
18998
|
2
|
|
|
|
|
|
const string output_format::CONLLU_V1 = "v1"; |
18999
|
2
|
|
|
|
|
|
const string output_format::CONLLU_V2 = "v2"; |
19000
|
2
|
|
|
|
|
|
const string output_format::HORIZONTAL_PARAGRAPHS = "paragraphs"; |
19001
|
2
|
|
|
|
|
|
const string output_format::PLAINTEXT_NORMALIZED_SPACES = "normalized_spaces"; |
19002
|
2
|
|
|
|
|
|
const string output_format::VERTICAL_PARAGRAPHS = "paragraphs"; |
19003
|
|
|
|
|
|
|
|
19004
|
|
|
|
|
|
|
// CoNLL-U output format |
19005
|
2
|
|
|
|
|
|
class output_format_conllu : public output_format { |
19006
|
|
|
|
|
|
|
public: |
19007
|
1
|
|
|
|
|
|
output_format_conllu(unsigned version) : version(version) {} |
19008
|
|
|
|
|
|
|
|
19009
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
19010
|
|
|
|
|
|
|
|
19011
|
|
|
|
|
|
|
private: |
19012
|
|
|
|
|
|
|
unsigned version; |
19013
|
|
|
|
|
|
|
static const string underscore; |
19014
|
14
|
0
|
|
|
|
|
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19015
|
|
|
|
|
|
|
ostream& write_with_spaces(ostream& os, const string& str); |
19016
|
|
|
|
|
|
|
}; |
19017
|
|
|
|
|
|
|
|
19018
|
2
|
|
|
|
|
|
const string output_format_conllu::underscore = "_"; |
19019
|
|
|
|
|
|
|
|
19020
|
2
|
|
|
|
|
|
void output_format_conllu::write_sentence(const sentence& s, ostream& os) { |
19021
|
|
|
|
|
|
|
// Comments |
19022
|
5
|
100
|
|
|
|
|
for (auto&& comment : s.comments) |
19023
|
|
|
|
|
|
|
os << comment << '\n'; |
19024
|
|
|
|
|
|
|
|
19025
|
|
|
|
|
|
|
// Words and multiword tokens |
19026
|
|
|
|
|
|
|
size_t multiword_token = 0, empty_node = 0; |
19027
|
9
|
100
|
|
|
|
|
for (int i = 0; i < int(s.words.size()); i++) { |
19028
|
|
|
|
|
|
|
// Write non-root nodes |
19029
|
8
|
100
|
|
|
|
|
if (i > 0) { |
19030
|
|
|
|
|
|
|
// Multiword token if present |
19031
|
7
|
50
|
|
|
|
|
if (multiword_token < s.multiword_tokens.size() && |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19032
|
0
|
|
|
|
|
|
i == s.multiword_tokens[multiword_token].id_first) { |
19033
|
0
|
|
|
|
|
|
os << s.multiword_tokens[multiword_token].id_first << '-' |
19034
|
0
|
|
|
|
|
|
<< s.multiword_tokens[multiword_token].id_last << '\t'; |
19035
|
0
|
|
|
|
|
|
write_with_spaces(os, s.multiword_tokens[multiword_token].form) << "\t_\t_\t_\t_\t_\t_\t_\t" |
19036
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.multiword_tokens[multiword_token].misc) << '\n'; |
19037
|
0
|
|
|
|
|
|
multiword_token++; |
19038
|
|
|
|
|
|
|
} |
19039
|
|
|
|
|
|
|
|
19040
|
|
|
|
|
|
|
// Write the word |
19041
|
7
|
|
|
|
|
|
os << i << '\t'; |
19042
|
7
|
|
|
|
|
|
write_with_spaces(os, s.words[i].form) << '\t'; |
19043
|
7
|
|
|
|
|
|
write_with_spaces(os, underscore_on_empty(s.words[i].lemma)) << '\t' |
19044
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].upostag) << '\t' |
19045
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].xpostag) << '\t' |
19046
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].feats) << '\t'; |
19047
|
7
|
50
|
|
|
|
|
if (s.words[i].head < 0) os << '_'; else os << s.words[i].head; os << '\t' |
19048
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].deprel) << '\t' |
19049
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].deps) << '\t' |
19050
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].misc) << '\n'; |
19051
|
|
|
|
|
|
|
} |
19052
|
|
|
|
|
|
|
|
19053
|
|
|
|
|
|
|
// Empty nodes |
19054
|
8
|
50
|
|
|
|
|
if (version >= 2) |
19055
|
8
|
50
|
|
|
|
|
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19056
|
0
|
|
|
|
|
|
os << i << '.' << s.empty_nodes[empty_node].index << '\t' |
19057
|
|
|
|
|
|
|
<< s.empty_nodes[empty_node].form << '\t' |
19058
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].lemma) << '\t' |
19059
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].upostag) << '\t' |
19060
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].xpostag) << '\t' |
19061
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].feats) << '\t' |
19062
|
|
|
|
|
|
|
<< "_\t" |
19063
|
|
|
|
|
|
|
<< "_\t" |
19064
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].deps) << '\t' |
19065
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].misc) << '\n'; |
19066
|
|
|
|
|
|
|
} |
19067
|
|
|
|
|
|
|
} |
19068
|
|
|
|
|
|
|
os << endl; |
19069
|
1
|
|
|
|
|
|
} |
19070
|
|
|
|
|
|
|
|
19071
|
14
|
|
|
|
|
|
ostream& output_format_conllu::write_with_spaces(ostream& os, const string& str) { |
19072
|
14
|
50
|
|
|
|
|
if (version >= 2 || str.find(' ') == string::npos) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19073
|
|
|
|
|
|
|
os << str; |
19074
|
|
|
|
|
|
|
else |
19075
|
0
|
0
|
|
|
|
|
for (auto&& chr : str) |
19076
|
0
|
0
|
|
|
|
|
os << (chr == ' ' ? '_' : chr); |
19077
|
|
|
|
|
|
|
|
19078
|
14
|
|
|
|
|
|
return os; |
19079
|
|
|
|
|
|
|
} |
19080
|
|
|
|
|
|
|
|
19081
|
|
|
|
|
|
|
// EPE output format |
19082
|
0
|
|
|
|
|
|
class output_format_epe : public output_format { |
19083
|
|
|
|
|
|
|
public: |
19084
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
19085
|
|
|
|
|
|
|
virtual void finish_document(ostream& os) override; |
19086
|
|
|
|
|
|
|
|
19087
|
|
|
|
|
|
|
private: |
19088
|
0
|
|
|
|
|
|
class json_builder { |
19089
|
|
|
|
|
|
|
public: |
19090
|
0
|
|
|
|
|
|
json_builder& object() { comma(); json.push_back('{'); stack.push_back('}'); return *this; } |
19091
|
0
|
|
|
|
|
|
json_builder& array() { comma(); json.push_back('['); stack.push_back(']'); return *this; } |
19092
|
0
|
0
|
|
|
|
|
json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; } |
|
|
0
|
|
|
|
|
|
19093
|
0
|
|
|
|
|
|
json_builder& key(string_piece name) { comma(); string(name); json.push_back(':'); return *this; } |
19094
|
0
|
0
|
|
|
|
|
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19095
|
0
|
0
|
|
|
|
|
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19096
|
0
|
|
|
|
|
|
json_builder& value_true() { comma(); json.push_back('t'); json.push_back('r'); json.push_back('u'); json.push_back('e'); comma_needed=true; return *this; } |
19097
|
|
|
|
|
|
|
|
19098
|
|
|
|
|
|
|
string_piece current() const { return string_piece(json.data(), json.size()); } |
19099
|
0
|
|
|
|
|
|
void clear() { json.clear(); stack.clear(); comma_needed=false; } |
19100
|
|
|
|
|
|
|
|
19101
|
|
|
|
|
|
|
private: |
19102
|
0
|
|
|
|
|
|
void comma() { |
19103
|
0
|
0
|
|
|
|
|
if (comma_needed) { |
19104
|
0
|
|
|
|
|
|
json.push_back(','); |
19105
|
0
|
|
|
|
|
|
json.push_back(' '); |
19106
|
|
|
|
|
|
|
} |
19107
|
0
|
|
|
|
|
|
comma_needed = false; |
19108
|
0
|
|
|
|
|
|
} |
19109
|
0
|
|
|
|
|
|
void string(string_piece str) { |
19110
|
0
|
|
|
|
|
|
json.push_back('"'); |
19111
|
0
|
0
|
|
|
|
|
for (; str.len; str.str++, str.len--) |
19112
|
0
|
|
|
|
|
|
switch (*str.str) { |
19113
|
0
|
|
|
|
|
|
case '"': json.push_back('\\'); json.push_back('\"'); break; |
19114
|
0
|
|
|
|
|
|
case '\\': json.push_back('\\'); json.push_back('\\'); break; |
19115
|
0
|
|
|
|
|
|
case '\b': json.push_back('\\'); json.push_back('b'); break; |
19116
|
0
|
|
|
|
|
|
case '\f': json.push_back('\\'); json.push_back('f'); break; |
19117
|
0
|
|
|
|
|
|
case '\n': json.push_back('\\'); json.push_back('n'); break; |
19118
|
0
|
|
|
|
|
|
case '\r': json.push_back('\\'); json.push_back('r'); break; |
19119
|
0
|
|
|
|
|
|
case '\t': json.push_back('\\'); json.push_back('t'); break; |
19120
|
|
|
|
|
|
|
default: |
19121
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str.str) < 32) { |
19122
|
0
|
|
|
|
|
|
json.push_back('u'); json.push_back('0'); json.push_back('0'); json.push_back('0' + (*str.str >> 4)); json.push_back("0123456789ABCDEF"[*str.str & 0xF]); |
19123
|
|
|
|
|
|
|
} else { |
19124
|
0
|
|
|
|
|
|
json.push_back(*str.str); |
19125
|
|
|
|
|
|
|
} |
19126
|
|
|
|
|
|
|
} |
19127
|
0
|
|
|
|
|
|
json.push_back('"'); |
19128
|
0
|
|
|
|
|
|
} |
19129
|
0
|
|
|
|
|
|
void number(size_t value) { |
19130
|
|
|
|
|
|
|
size_t start_size = json.size(); |
19131
|
0
|
0
|
|
|
|
|
for (; value || start_size == json.size(); value /= 10) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19132
|
0
|
|
|
|
|
|
json.push_back('0' + (value % 10)); |
19133
|
|
|
|
|
|
|
reverse(json.begin() + start_size, json.end()); |
19134
|
0
|
|
|
|
|
|
} |
19135
|
|
|
|
|
|
|
|
19136
|
|
|
|
|
|
|
std::vector json; |
19137
|
|
|
|
|
|
|
std::vector stack; |
19138
|
|
|
|
|
|
|
bool comma_needed = false; |
19139
|
|
|
|
|
|
|
} json; |
19140
|
|
|
|
|
|
|
|
19141
|
|
|
|
|
|
|
vector feats; |
19142
|
|
|
|
|
|
|
size_t sentences = 0; |
19143
|
|
|
|
|
|
|
}; |
19144
|
|
|
|
|
|
|
|
19145
|
0
|
|
|
|
|
|
void output_format_epe::write_sentence(const sentence& s, ostream& os) { |
19146
|
0
|
0
|
|
|
|
|
json.object().key("id").value(++sentences).key("nodes").array(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19147
|
|
|
|
|
|
|
|
19148
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
19149
|
0
|
0
|
|
|
|
|
json.object().key("id").value(i).key("form").value(s.words[i].form); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19150
|
|
|
|
|
|
|
|
19151
|
|
|
|
|
|
|
size_t start, end; |
19152
|
0
|
0
|
|
|
|
|
if (s.words[i].get_token_range(start, end)) |
19153
|
0
|
0
|
|
|
|
|
json.key("start").value(start).key("end").value(end); |
|
|
0
|
|
|
|
|
|
19154
|
0
|
0
|
|
|
|
|
if (s.words[i].head == 0) |
19155
|
0
|
|
|
|
|
|
json.key("top").value_true(); |
19156
|
|
|
|
|
|
|
|
19157
|
0
|
0
|
|
|
|
|
json.key("properties").object() |
|
|
0
|
|
|
|
|
|
19158
|
0
|
0
|
|
|
|
|
.key("lemma").value(s.words[i].lemma) |
19159
|
0
|
0
|
|
|
|
|
.key("upos").value(s.words[i].upostag) |
19160
|
0
|
0
|
|
|
|
|
.key("xpos").value(s.words[i].xpostag); |
19161
|
0
|
|
|
|
|
|
split(s.words[i].feats, '|', feats); |
19162
|
0
|
0
|
|
|
|
|
for (auto&& feat : feats) { |
19163
|
0
|
|
|
|
|
|
string_piece key(feat.str, 0); |
19164
|
0
|
0
|
|
|
|
|
while (key.len < feat.len && key.str[key.len] != '=') |
|
|
0
|
|
|
|
|
|
19165
|
0
|
|
|
|
|
|
key.len++; |
19166
|
0
|
0
|
|
|
|
|
if (key.len + 1 < feat.len) |
19167
|
0
|
0
|
|
|
|
|
json.key(key).value(string_piece(key.str + key.len + 1, feat.len - key.len - 1)); |
19168
|
|
|
|
|
|
|
} |
19169
|
0
|
|
|
|
|
|
json.close(); |
19170
|
|
|
|
|
|
|
|
19171
|
0
|
0
|
|
|
|
|
if (!s.words[i].children.empty()) { |
19172
|
0
|
|
|
|
|
|
json.key("edges").array(); |
19173
|
0
|
0
|
|
|
|
|
for (auto&& child : s.words[i].children) |
19174
|
0
|
0
|
|
|
|
|
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19175
|
0
|
|
|
|
|
|
json.close(); |
19176
|
|
|
|
|
|
|
} |
19177
|
|
|
|
|
|
|
|
19178
|
0
|
|
|
|
|
|
json.close(); |
19179
|
|
|
|
|
|
|
} |
19180
|
0
|
|
|
|
|
|
json.close().close(); |
19181
|
|
|
|
|
|
|
|
19182
|
|
|
|
|
|
|
string_piece current = json.current(); |
19183
|
0
|
|
|
|
|
|
os.write(current.str, current.len).put('\n'); |
19184
|
|
|
|
|
|
|
json.clear(); |
19185
|
0
|
|
|
|
|
|
} |
19186
|
|
|
|
|
|
|
|
19187
|
0
|
|
|
|
|
|
void output_format_epe::finish_document(ostream& /*os*/) { |
19188
|
0
|
|
|
|
|
|
sentences = 0; |
19189
|
0
|
|
|
|
|
|
} |
19190
|
|
|
|
|
|
|
|
19191
|
|
|
|
|
|
|
// Matxin output format |
19192
|
0
|
|
|
|
|
|
class output_format_matxin : public output_format { |
19193
|
|
|
|
|
|
|
public: |
19194
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
19195
|
|
|
|
|
|
|
virtual void finish_document(ostream& os) override; |
19196
|
|
|
|
|
|
|
|
19197
|
|
|
|
|
|
|
private: |
19198
|
|
|
|
|
|
|
void write_node(const sentence& s, int node, string& pad, ostream& os); |
19199
|
|
|
|
|
|
|
|
19200
|
|
|
|
|
|
|
int sentences = 0; |
19201
|
|
|
|
|
|
|
}; |
19202
|
|
|
|
|
|
|
|
19203
|
0
|
|
|
|
|
|
void output_format_matxin::write_sentence(const sentence& s, ostream& os) { |
19204
|
0
|
0
|
|
|
|
|
if (!sentences) { |
19205
|
0
|
|
|
|
|
|
os << ""; |
19206
|
|
|
|
|
|
|
} |
19207
|
0
|
|
|
|
|
|
os << "\n\n"; |
19208
|
|
|
|
|
|
|
|
19209
|
|
|
|
|
|
|
string pad; |
19210
|
0
|
0
|
|
|
|
|
for (auto&& node : s.words[0].children) |
19211
|
0
|
0
|
|
|
|
|
write_node(s, node, pad, os); |
19212
|
|
|
|
|
|
|
|
19213
|
|
|
|
|
|
|
os << "" << endl; |
19214
|
0
|
|
|
|
|
|
} |
19215
|
|
|
|
|
|
|
|
19216
|
0
|
|
|
|
|
|
void output_format_matxin::finish_document(ostream& os) { |
19217
|
0
|
|
|
|
|
|
os << "\n"; |
19218
|
|
|
|
|
|
|
|
19219
|
0
|
|
|
|
|
|
sentences = 0; |
19220
|
0
|
|
|
|
|
|
} |
19221
|
|
|
|
|
|
|
|
19222
|
0
|
|
|
|
|
|
void output_format_matxin::write_node(const sentence& s, int node, string& pad, ostream& os) { |
19223
|
|
|
|
|
|
|
// |
19224
|
0
|
|
|
|
|
|
pad.push_back(' '); |
19225
|
|
|
|
|
|
|
|
19226
|
0
|
0
|
|
|
|
|
os << pad << "
|
|
|
0
|
|
|
|
|
|
19227
|
0
|
0
|
|
|
|
|
<< "\" form=\"" << xml_encoded(s.words[node].form, true) |
19228
|
0
|
0
|
|
|
|
|
<< "\" lem=\"" << xml_encoded(s.words[node].lemma, true) |
19229
|
0
|
0
|
|
|
|
|
<< "\" mi=\"" << xml_encoded(s.words[node].feats, true) |
19230
|
0
|
0
|
|
|
|
|
<< "\" si=\"" << xml_encoded(s.words[node].deprel, true) << '"'; |
19231
|
|
|
|
|
|
|
|
19232
|
0
|
0
|
|
|
|
|
if (s.words[node].children.empty()) { |
19233
|
0
|
|
|
|
|
|
os << "/>\n"; |
19234
|
|
|
|
|
|
|
} else { |
19235
|
0
|
|
|
|
|
|
os << ">\n"; |
19236
|
0
|
0
|
|
|
|
|
for (auto&& child : s.words[node].children) |
19237
|
0
|
|
|
|
|
|
write_node(s, child, pad, os); |
19238
|
0
|
|
|
|
|
|
os << pad << "\n"; |
19239
|
|
|
|
|
|
|
} |
19240
|
|
|
|
|
|
|
|
19241
|
|
|
|
|
|
|
pad.pop_back(); |
19242
|
0
|
|
|
|
|
|
} |
19243
|
|
|
|
|
|
|
|
19244
|
|
|
|
|
|
|
// Horizontal output format |
19245
|
0
|
|
|
|
|
|
class output_format_horizontal : public output_format { |
19246
|
|
|
|
|
|
|
public: |
19247
|
0
|
|
|
|
|
|
output_format_horizontal(bool paragraphs) : paragraphs(paragraphs), empty(true) {} |
19248
|
|
|
|
|
|
|
|
19249
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
19250
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) override { empty = true; } |
19251
|
|
|
|
|
|
|
|
19252
|
|
|
|
|
|
|
private: |
19253
|
|
|
|
|
|
|
bool paragraphs; |
19254
|
|
|
|
|
|
|
bool empty; |
19255
|
|
|
|
|
|
|
}; |
19256
|
|
|
|
|
|
|
|
19257
|
0
|
|
|
|
|
|
void output_format_horizontal::write_sentence(const sentence& s, ostream& os) { |
19258
|
0
|
0
|
|
|
|
|
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19259
|
|
|
|
|
|
|
os << '\n'; |
19260
|
0
|
|
|
|
|
|
empty = false; |
19261
|
|
|
|
|
|
|
|
19262
|
|
|
|
|
|
|
string line; |
19263
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
19264
|
|
|
|
|
|
|
// Append word, but replace spaces by s |
19265
|
0
|
0
|
|
|
|
|
for (auto&& chr : s.words[i].form) |
19266
|
0
|
0
|
|
|
|
|
if (chr == ' ') |
19267
|
0
|
0
|
|
|
|
|
line.append("\302\240"); |
19268
|
|
|
|
|
|
|
else |
19269
|
0
|
0
|
|
|
|
|
line.push_back(chr); |
19270
|
|
|
|
|
|
|
|
19271
|
0
|
0
|
|
|
|
|
if (i+1 < s.words.size()) |
19272
|
0
|
0
|
|
|
|
|
line.push_back(' '); |
19273
|
|
|
|
|
|
|
} |
19274
|
|
|
|
|
|
|
os << line << endl; |
19275
|
0
|
|
|
|
|
|
} |
19276
|
|
|
|
|
|
|
|
19277
|
|
|
|
|
|
|
// Plaintext output format |
19278
|
0
|
|
|
|
|
|
class output_format_plaintext : public output_format { |
19279
|
|
|
|
|
|
|
public: |
19280
|
0
|
|
|
|
|
|
output_format_plaintext(bool normalized): normalized(normalized), empty(true) {} |
19281
|
|
|
|
|
|
|
|
19282
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
19283
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) override { empty = true; } |
19284
|
|
|
|
|
|
|
private: |
19285
|
|
|
|
|
|
|
bool normalized; |
19286
|
|
|
|
|
|
|
bool empty; |
19287
|
|
|
|
|
|
|
}; |
19288
|
|
|
|
|
|
|
|
19289
|
0
|
|
|
|
|
|
void output_format_plaintext::write_sentence(const sentence& s, ostream& os) { |
19290
|
0
|
0
|
|
|
|
|
if (normalized) { |
19291
|
0
|
0
|
|
|
|
|
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19292
|
|
|
|
|
|
|
os << '\n'; |
19293
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
19294
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
|
0
|
|
|
|
|
|
19295
|
|
|
|
|
|
|
os << tok.form; |
19296
|
0
|
0
|
|
|
|
|
if (i+1 < s.words.size() && tok.get_space_after()) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19297
|
|
|
|
|
|
|
os << ' '; |
19298
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19299
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
19300
|
|
|
|
|
|
|
} |
19301
|
|
|
|
|
|
|
os << endl; |
19302
|
|
|
|
|
|
|
} else { |
19303
|
|
|
|
|
|
|
string spaces; |
19304
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
19305
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
|
0
|
|
|
|
|
|
19306
|
0
|
0
|
|
|
|
|
tok.get_spaces_before(spaces); os << spaces; |
19307
|
0
|
0
|
|
|
|
|
tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form); |
|
|
0
|
|
|
|
|
|
19308
|
0
|
0
|
|
|
|
|
tok.get_spaces_after(spaces); os << spaces; |
19309
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19310
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
19311
|
|
|
|
|
|
|
} |
19312
|
|
|
|
|
|
|
os << flush; |
19313
|
|
|
|
|
|
|
} |
19314
|
0
|
|
|
|
|
|
empty = false; |
19315
|
0
|
|
|
|
|
|
} |
19316
|
|
|
|
|
|
|
|
19317
|
|
|
|
|
|
|
// Vertical output format |
19318
|
0
|
|
|
|
|
|
class output_format_vertical : public output_format { |
19319
|
|
|
|
|
|
|
public: |
19320
|
0
|
|
|
|
|
|
output_format_vertical(bool paragraphs) : paragraphs(paragraphs), empty(true) {} |
19321
|
|
|
|
|
|
|
|
19322
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
19323
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) override { empty = true; } |
19324
|
|
|
|
|
|
|
|
19325
|
|
|
|
|
|
|
private: |
19326
|
|
|
|
|
|
|
bool paragraphs; |
19327
|
|
|
|
|
|
|
bool empty; |
19328
|
|
|
|
|
|
|
}; |
19329
|
|
|
|
|
|
|
|
19330
|
0
|
|
|
|
|
|
void output_format_vertical::write_sentence(const sentence& s, ostream& os) { |
19331
|
0
|
0
|
|
|
|
|
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19332
|
|
|
|
|
|
|
os << '\n'; |
19333
|
0
|
|
|
|
|
|
empty = false; |
19334
|
|
|
|
|
|
|
|
19335
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) |
19336
|
|
|
|
|
|
|
os << s.words[i].form << '\n'; |
19337
|
|
|
|
|
|
|
os << endl; |
19338
|
0
|
|
|
|
|
|
} |
19339
|
|
|
|
|
|
|
|
19340
|
|
|
|
|
|
|
// Static factory methods |
19341
|
1
|
|
|
|
|
|
output_format* output_format::new_conllu_output_format(const string& options) { |
19342
|
|
|
|
|
|
|
named_values::map parsed_options; |
19343
|
|
|
|
|
|
|
string parse_error; |
19344
|
1
|
50
|
|
|
|
|
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
50
|
|
|
|
|
|
19345
|
|
|
|
|
|
|
return nullptr; |
19346
|
|
|
|
|
|
|
|
19347
|
|
|
|
|
|
|
unsigned version = 2; |
19348
|
1
|
50
|
|
|
|
|
if (parsed_options.count(CONLLU_V1)) |
19349
|
|
|
|
|
|
|
version = 1; |
19350
|
1
|
50
|
|
|
|
|
if (parsed_options.count(CONLLU_V2)) |
19351
|
|
|
|
|
|
|
version = 2; |
19352
|
|
|
|
|
|
|
|
19353
|
1
|
50
|
|
|
|
|
return new output_format_conllu(version); |
19354
|
|
|
|
|
|
|
} |
19355
|
|
|
|
|
|
|
|
19356
|
0
|
|
|
|
|
|
output_format* output_format::new_epe_output_format(const string& /*options*/) { |
19357
|
0
|
|
|
|
|
|
return new output_format_epe(); |
19358
|
|
|
|
|
|
|
} |
19359
|
|
|
|
|
|
|
|
19360
|
0
|
|
|
|
|
|
output_format* output_format::new_matxin_output_format(const string& /*options*/) { |
19361
|
0
|
0
|
|
|
|
|
return new output_format_matxin(); |
19362
|
|
|
|
|
|
|
} |
19363
|
|
|
|
|
|
|
|
19364
|
0
|
|
|
|
|
|
output_format* output_format::new_horizontal_output_format(const string& options) { |
19365
|
|
|
|
|
|
|
named_values::map parsed_options; |
19366
|
|
|
|
|
|
|
string parse_error; |
19367
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0
|
|
|
|
|
|
19368
|
|
|
|
|
|
|
return nullptr; |
19369
|
|
|
|
|
|
|
|
19370
|
0
|
0
|
|
|
|
|
return new output_format_horizontal(parsed_options.count(HORIZONTAL_PARAGRAPHS)); |
19371
|
|
|
|
|
|
|
} |
19372
|
|
|
|
|
|
|
|
19373
|
0
|
|
|
|
|
|
output_format* output_format::new_plaintext_output_format(const string& options) { |
19374
|
|
|
|
|
|
|
named_values::map parsed_options; |
19375
|
|
|
|
|
|
|
string parse_error; |
19376
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0
|
|
|
|
|
|
19377
|
|
|
|
|
|
|
return nullptr; |
19378
|
|
|
|
|
|
|
|
19379
|
0
|
0
|
|
|
|
|
return new output_format_plaintext(parsed_options.count(PLAINTEXT_NORMALIZED_SPACES)); |
19380
|
|
|
|
|
|
|
} |
19381
|
|
|
|
|
|
|
|
19382
|
0
|
|
|
|
|
|
output_format* output_format::new_vertical_output_format(const string& options) { |
19383
|
|
|
|
|
|
|
named_values::map parsed_options; |
19384
|
|
|
|
|
|
|
string parse_error; |
19385
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0
|
|
|
|
|
|
19386
|
|
|
|
|
|
|
return nullptr; |
19387
|
|
|
|
|
|
|
|
19388
|
0
|
0
|
|
|
|
|
return new output_format_vertical(parsed_options.count(VERTICAL_PARAGRAPHS)); |
19389
|
|
|
|
|
|
|
} |
19390
|
|
|
|
|
|
|
|
19391
|
1
|
|
|
|
|
|
output_format* output_format::new_output_format(const string& name) { |
19392
|
1
|
|
|
|
|
|
size_t equal = name.find('='); |
19393
|
1
|
50
|
|
|
|
|
size_t name_len = equal != string::npos ? equal : name.size(); |
19394
|
1
|
50
|
|
|
|
|
size_t option_offset = equal != string::npos ? equal + 1 : name.size(); |
19395
|
|
|
|
|
|
|
|
19396
|
2
|
50
|
|
|
|
|
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset)); |
|
|
50
|
|
|
|
|
|
19397
|
0
|
0
|
|
|
|
|
if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset)); |
|
|
0
|
|
|
|
|
|
19398
|
0
|
0
|
|
|
|
|
if (name.compare(0, name_len, "matxin") == 0) return new_matxin_output_format(name.substr(option_offset)); |
19399
|
0
|
0
|
|
|
|
|
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset)); |
|
|
0
|
|
|
|
|
|
19400
|
0
|
0
|
|
|
|
|
if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset)); |
|
|
0
|
|
|
|
|
|
19401
|
1
|
0
|
|
|
|
|
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset)); |
|
|
0
|
|
|
|
|
|
19402
|
|
|
|
|
|
|
return nullptr; |
19403
|
|
|
|
|
|
|
} |
19404
|
|
|
|
|
|
|
|
19405
|
|
|
|
|
|
|
///////// |
19406
|
|
|
|
|
|
|
// File: sentence/sentence.cpp |
19407
|
|
|
|
|
|
|
///////// |
19408
|
|
|
|
|
|
|
|
19409
|
|
|
|
|
|
|
// This file is part of UDPipe . |
19410
|
|
|
|
|
|
|
// |
19411
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
19412
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
19413
|
|
|
|
|
|
|
// |
19414
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
19415
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
19416
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
19417
|
|
|
|
|
|
|
|
19418
|
2
|
|
|
|
|
|
const string sentence::root_form = ""; |
19419
|
|
|
|
|
|
|
|
19420
|
1
|
|
|
|
|
|
sentence::sentence() { |
19421
|
1
|
50
|
|
|
|
|
clear(); |
19422
|
1
|
|
|
|
|
|
} |
19423
|
|
|
|
|
|
|
|
19424
|
0
|
|
|
|
|
|
bool sentence::empty() { |
19425
|
0
|
|
|
|
|
|
return words.size() == 1; |
19426
|
|
|
|
|
|
|
} |
19427
|
|
|
|
|
|
|
|
19428
|
3
|
|
|
|
|
|
void sentence::clear() { |
19429
|
|
|
|
|
|
|
words.clear(); |
19430
|
|
|
|
|
|
|
multiword_tokens.clear(); |
19431
|
|
|
|
|
|
|
empty_nodes.clear(); |
19432
|
3
|
|
|
|
|
|
comments.clear(); |
19433
|
|
|
|
|
|
|
|
19434
|
|
|
|
|
|
|
word& root = add_word(root_form); |
19435
|
12
|
|
|
|
|
|
root.lemma = root.upostag = root.xpostag = root.feats = root_form; |
19436
|
3
|
|
|
|
|
|
} |
19437
|
|
|
|
|
|
|
|
19438
|
0
|
|
|
|
|
|
word& sentence::add_word(string_piece form) { |
19439
|
10
|
0
|
|
|
|
|
words.emplace_back((int)words.size(), form); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19440
|
0
|
|
|
|
|
|
return words.back(); |
19441
|
|
|
|
|
|
|
} |
19442
|
|
|
|
|
|
|
|
19443
|
7
|
|
|
|
|
|
void sentence::set_head(int id, int head, const string& deprel) { |
19444
|
7
|
50
|
|
|
|
|
assert(id >= 0 && id < int(words.size())); |
|
|
50
|
|
|
|
|
|
19445
|
7
|
50
|
|
|
|
|
assert(head < int(words.size())); |
19446
|
|
|
|
|
|
|
|
19447
|
|
|
|
|
|
|
// Remove existing head |
19448
|
7
|
50
|
|
|
|
|
if (words[id].head >= 0) { |
19449
|
0
|
|
|
|
|
|
auto& children = words[words[id].head].children; |
19450
|
0
|
0
|
|
|
|
|
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19451
|
0
|
0
|
|
|
|
|
if (children[i-1] == id) { |
19452
|
|
|
|
|
|
|
children.erase(children.begin() + i - 1); |
19453
|
0
|
|
|
|
|
|
break; |
19454
|
|
|
|
|
|
|
} |
19455
|
|
|
|
|
|
|
} |
19456
|
|
|
|
|
|
|
|
19457
|
|
|
|
|
|
|
// Set new head |
19458
|
14
|
|
|
|
|
|
words[id].head = head; |
19459
|
7
|
|
|
|
|
|
words[id].deprel = deprel; |
19460
|
7
|
50
|
|
|
|
|
if (head >= 0) { |
19461
|
14
|
|
|
|
|
|
auto& children = words[head].children; |
19462
|
|
|
|
|
|
|
size_t i = children.size(); |
19463
|
7
|
100
|
|
|
|
|
while (i && children[i-1] > id) i--; |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19464
|
7
|
100
|
|
|
|
|
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19465
|
|
|
|
|
|
|
} |
19466
|
7
|
|
|
|
|
|
} |
19467
|
|
|
|
|
|
|
|
19468
|
0
|
|
|
|
|
|
void sentence::unlink_all_words() { |
19469
|
0
|
0
|
|
|
|
|
for (auto&& word : words) { |
19470
|
0
|
|
|
|
|
|
word.head = -1; |
19471
|
|
|
|
|
|
|
word.deprel.clear(); |
19472
|
|
|
|
|
|
|
word.children.clear(); |
19473
|
|
|
|
|
|
|
} |
19474
|
0
|
|
|
|
|
|
} |
19475
|
|
|
|
|
|
|
|
19476
|
0
|
|
|
|
|
|
bool sentence::get_new_doc(string* id) const { |
19477
|
0
|
0
|
|
|
|
|
if (get_comment("newdoc id", id)) |
19478
|
|
|
|
|
|
|
return true; |
19479
|
0
|
|
|
|
|
|
return get_comment("newdoc", id); |
19480
|
|
|
|
|
|
|
} |
19481
|
|
|
|
|
|
|
|
19482
|
1
|
|
|
|
|
|
void sentence::set_new_doc(bool new_doc, string_piece id) { |
19483
|
1
|
|
|
|
|
|
remove_comment("newdoc"); |
19484
|
1
|
|
|
|
|
|
remove_comment("newdoc id"); |
19485
|
|
|
|
|
|
|
|
19486
|
1
|
50
|
|
|
|
|
if (new_doc && id.len) |
|
|
50
|
|
|
|
|
|
19487
|
0
|
|
|
|
|
|
set_comment("newdoc id", id); |
19488
|
1
|
50
|
|
|
|
|
else if (new_doc) |
19489
|
1
|
|
|
|
|
|
set_comment("newdoc"); |
19490
|
1
|
|
|
|
|
|
} |
19491
|
|
|
|
|
|
|
|
19492
|
0
|
|
|
|
|
|
bool sentence::get_new_par(string* id) const { |
19493
|
0
|
0
|
|
|
|
|
if (get_comment("newpar id", id)) |
19494
|
|
|
|
|
|
|
return true; |
19495
|
0
|
|
|
|
|
|
return get_comment("newpar", id); |
19496
|
|
|
|
|
|
|
} |
19497
|
|
|
|
|
|
|
|
19498
|
1
|
|
|
|
|
|
void sentence::set_new_par(bool new_par, string_piece id) { |
19499
|
1
|
|
|
|
|
|
remove_comment("newpar"); |
19500
|
1
|
|
|
|
|
|
remove_comment("newpar id"); |
19501
|
|
|
|
|
|
|
|
19502
|
1
|
50
|
|
|
|
|
if (new_par && id.len) |
|
|
50
|
|
|
|
|
|
19503
|
0
|
|
|
|
|
|
set_comment("newpar id", id); |
19504
|
1
|
50
|
|
|
|
|
else if (new_par) |
19505
|
1
|
|
|
|
|
|
set_comment("newpar"); |
19506
|
1
|
|
|
|
|
|
} |
19507
|
|
|
|
|
|
|
|
19508
|
0
|
|
|
|
|
|
bool sentence::get_sent_id(string& id) const { |
19509
|
|
|
|
|
|
|
id.clear(); |
19510
|
|
|
|
|
|
|
|
19511
|
0
|
|
|
|
|
|
return get_comment("sent_id", &id); |
19512
|
|
|
|
|
|
|
} |
19513
|
|
|
|
|
|
|
|
19514
|
1
|
|
|
|
|
|
void sentence::set_sent_id(string_piece id) { |
19515
|
1
|
|
|
|
|
|
remove_comment("sent_id"); |
19516
|
|
|
|
|
|
|
|
19517
|
1
|
50
|
|
|
|
|
if (id.len) |
19518
|
1
|
|
|
|
|
|
set_comment("sent_id", id); |
19519
|
1
|
|
|
|
|
|
} |
19520
|
|
|
|
|
|
|
|
19521
|
0
|
|
|
|
|
|
bool sentence::get_text(string& text) const { |
19522
|
|
|
|
|
|
|
text.clear(); |
19523
|
|
|
|
|
|
|
|
19524
|
0
|
|
|
|
|
|
return get_comment("text", &text); |
19525
|
|
|
|
|
|
|
} |
19526
|
|
|
|
|
|
|
|
19527
|
0
|
|
|
|
|
|
void sentence::set_text(string_piece text) { |
19528
|
0
|
|
|
|
|
|
remove_comment("text"); |
19529
|
|
|
|
|
|
|
|
19530
|
0
|
0
|
|
|
|
|
if (text.len) |
19531
|
0
|
|
|
|
|
|
set_comment("text", text); |
19532
|
0
|
|
|
|
|
|
} |
19533
|
|
|
|
|
|
|
|
19534
|
0
|
|
|
|
|
|
bool sentence::get_comment(string_piece name, string* value) const { |
19535
|
0
|
0
|
|
|
|
|
for (auto&& comment : comments) |
19536
|
0
|
0
|
|
|
|
|
if (comment[0] == '#') { |
19537
|
|
|
|
|
|
|
// Skip spaces |
19538
|
|
|
|
|
|
|
unsigned j = 1; |
19539
|
0
|
0
|
|
|
|
|
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19540
|
|
|
|
|
|
|
|
19541
|
|
|
|
|
|
|
// Try matching the name |
19542
|
0
|
0
|
|
|
|
|
if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19543
|
0
|
|
|
|
|
|
j += name.len; |
19544
|
0
|
0
|
|
|
|
|
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19545
|
0
|
0
|
|
|
|
|
if (j < comment.size() && comment[j] == '=') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19546
|
|
|
|
|
|
|
//We have a value |
19547
|
0
|
|
|
|
|
|
j++; |
19548
|
0
|
0
|
|
|
|
|
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19549
|
0
|
0
|
|
|
|
|
if (value) value->assign(comment, j, comment.size() - j); |
19550
|
|
|
|
|
|
|
} else { |
19551
|
|
|
|
|
|
|
// No value |
19552
|
0
|
0
|
|
|
|
|
if (value) value->clear(); |
19553
|
|
|
|
|
|
|
} |
19554
|
|
|
|
|
|
|
|
19555
|
|
|
|
|
|
|
return true; |
19556
|
|
|
|
|
|
|
} |
19557
|
|
|
|
|
|
|
} |
19558
|
|
|
|
|
|
|
|
19559
|
|
|
|
|
|
|
return false; |
19560
|
|
|
|
|
|
|
} |
19561
|
|
|
|
|
|
|
|
19562
|
8
|
|
|
|
|
|
void sentence::remove_comment(string_piece name) { |
19563
|
15
|
100
|
|
|
|
|
for (unsigned i = comments.size(); i--; ) |
19564
|
7
|
50
|
|
|
|
|
if (comments[i][0] == '#') { |
19565
|
|
|
|
|
|
|
// Skip spaces |
19566
|
|
|
|
|
|
|
unsigned j = 1; |
19567
|
14
|
50
|
|
|
|
|
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
19568
|
|
|
|
|
|
|
|
19569
|
|
|
|
|
|
|
// Remove matching comments |
19570
|
7
|
100
|
|
|
|
|
if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0) |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19571
|
0
|
|
|
|
|
|
comments.erase(comments.begin() + i); |
19572
|
|
|
|
|
|
|
} |
19573
|
8
|
|
|
|
|
|
} |
19574
|
|
|
|
|
|
|
|
19575
|
3
|
|
|
|
|
|
void sentence::set_comment(string_piece name, string_piece value) { |
19576
|
3
|
|
|
|
|
|
remove_comment(name); |
19577
|
|
|
|
|
|
|
|
19578
|
|
|
|
|
|
|
string comment; |
19579
|
3
|
50
|
|
|
|
|
comment.append("# ").append(name.str, name.len); |
|
|
50
|
|
|
|
|
|
19580
|
3
|
100
|
|
|
|
|
if (value.len) { |
19581
|
1
|
50
|
|
|
|
|
comment.append(" = "); |
19582
|
2
|
100
|
|
|
|
|
for (size_t i = 0; i < value.len; i++) |
19583
|
1
|
50
|
|
|
|
|
comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]); |
|
|
50
|
|
|
|
|
|
19584
|
|
|
|
|
|
|
} |
19585
|
3
|
|
|
|
|
|
comments.push_back(move(comment)); |
19586
|
3
|
|
|
|
|
|
} |
19587
|
|
|
|
|
|
|
|
19588
|
|
|
|
|
|
|
///////// |
19589
|
|
|
|
|
|
|
// File: sentence/token.cpp |
19590
|
|
|
|
|
|
|
///////// |
19591
|
|
|
|
|
|
|
|
19592
|
|
|
|
|
|
|
// This file is part of UDPipe . |
19593
|
|
|
|
|
|
|
// |
19594
|
|
|
|
|
|
|
// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of |
19595
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
19596
|
|
|
|
|
|
|
// |
19597
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
19598
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
19599
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
19600
|
|
|
|
|
|
|
|
19601
|
11
|
|
|
|
|
|
token::token(string_piece form, string_piece misc) { |
19602
|
11
|
100
|
|
|
|
|
if (form.len) this->form.assign(form.str, form.len); |
19603
|
11
|
50
|
|
|
|
|
if (misc.len) this->misc.assign(misc.str, misc.len); |
19604
|
11
|
|
|
|
|
|
} |
19605
|
|
|
|
|
|
|
|
19606
|
|
|
|
|
|
|
// CoNLL-U defined SpaceAfter=No feature |
19607
|
6
|
|
|
|
|
|
bool token::get_space_after() const { |
19608
|
|
|
|
|
|
|
string_piece value; |
19609
|
|
|
|
|
|
|
|
19610
|
6
|
100
|
|
|
|
|
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19611
|
|
|
|
|
|
|
} |
19612
|
|
|
|
|
|
|
|
19613
|
7
|
|
|
|
|
|
void token::set_space_after(bool space_after) { |
19614
|
7
|
100
|
|
|
|
|
if (space_after) |
19615
|
5
|
|
|
|
|
|
remove_misc_field("SpaceAfter"); |
19616
|
|
|
|
|
|
|
else |
19617
|
2
|
|
|
|
|
|
start_misc_field("SpaceAfter").append("No"); |
19618
|
7
|
|
|
|
|
|
} |
19619
|
|
|
|
|
|
|
|
19620
|
|
|
|
|
|
|
// UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features |
19621
|
0
|
|
|
|
|
|
void token::get_spaces_before(string& spaces_before) const { |
19622
|
|
|
|
|
|
|
string_piece value; |
19623
|
|
|
|
|
|
|
|
19624
|
0
|
0
|
|
|
|
|
if (get_misc_field("SpacesBefore", value)) |
19625
|
0
|
|
|
|
|
|
unescape_spaces(value, spaces_before); |
19626
|
|
|
|
|
|
|
else |
19627
|
|
|
|
|
|
|
spaces_before.clear(); |
19628
|
0
|
|
|
|
|
|
} |
19629
|
|
|
|
|
|
|
|
19630
|
7
|
|
|
|
|
|
void token::set_spaces_before(string_piece spaces_before) { |
19631
|
7
|
50
|
|
|
|
|
if (spaces_before.len == 0) |
19632
|
7
|
|
|
|
|
|
remove_misc_field("SpacesBefore"); |
19633
|
|
|
|
|
|
|
else |
19634
|
0
|
|
|
|
|
|
append_escaped_spaces(spaces_before, start_misc_field("SpacesBefore")); |
19635
|
7
|
|
|
|
|
|
} |
19636
|
|
|
|
|
|
|
|
19637
|
0
|
|
|
|
|
|
void token::get_spaces_after(string& spaces_after) const { |
19638
|
|
|
|
|
|
|
string_piece value; |
19639
|
|
|
|
|
|
|
|
19640
|
0
|
0
|
|
|
|
|
if (get_misc_field("SpacesAfter", value)) |
19641
|
0
|
|
|
|
|
|
unescape_spaces(value, spaces_after); |
19642
|
|
|
|
|
|
|
else |
19643
|
0
|
0
|
|
|
|
|
spaces_after.assign(get_space_after() ? " " : ""); |
19644
|
0
|
|
|
|
|
|
} |
19645
|
|
|
|
|
|
|
|
19646
|
7
|
|
|
|
|
|
void token::set_spaces_after(string_piece spaces_after) { |
19647
|
7
|
100
|
|
|
|
|
if (spaces_after.len == 0) { |
19648
|
2
|
|
|
|
|
|
set_space_after(false); |
19649
|
2
|
|
|
|
|
|
remove_misc_field("SpacesAfter"); |
19650
|
5
|
50
|
|
|
|
|
} else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') { |
|
|
50
|
|
|
|
|
|
19651
|
5
|
|
|
|
|
|
set_space_after(true); |
19652
|
5
|
|
|
|
|
|
remove_misc_field("SpacesAfter"); |
19653
|
|
|
|
|
|
|
} else { |
19654
|
0
|
|
|
|
|
|
set_space_after(true); |
19655
|
0
|
|
|
|
|
|
append_escaped_spaces(spaces_after, start_misc_field("SpacesAfter")); |
19656
|
|
|
|
|
|
|
} |
19657
|
7
|
|
|
|
|
|
} |
19658
|
|
|
|
|
|
|
|
19659
|
0
|
|
|
|
|
|
void token::get_spaces_in_token(string& spaces_in_token) const { |
19660
|
|
|
|
|
|
|
string_piece value; |
19661
|
|
|
|
|
|
|
|
19662
|
0
|
0
|
|
|
|
|
if (get_misc_field("SpacesInToken", value)) |
19663
|
0
|
|
|
|
|
|
unescape_spaces(value, spaces_in_token); |
19664
|
|
|
|
|
|
|
else |
19665
|
|
|
|
|
|
|
spaces_in_token.clear(); |
19666
|
0
|
|
|
|
|
|
} |
19667
|
|
|
|
|
|
|
|
19668
|
7
|
|
|
|
|
|
void token::set_spaces_in_token(string_piece spaces_in_token) { |
19669
|
7
|
50
|
|
|
|
|
if (spaces_in_token.len == 0) |
19670
|
7
|
|
|
|
|
|
remove_misc_field("SpacesInToken"); |
19671
|
|
|
|
|
|
|
else |
19672
|
0
|
|
|
|
|
|
append_escaped_spaces(spaces_in_token, start_misc_field("SpacesInToken")); |
19673
|
7
|
|
|
|
|
|
} |
19674
|
|
|
|
|
|
|
|
19675
|
|
|
|
|
|
|
// UDPipe-specific TokenRange feature |
19676
|
0
|
|
|
|
|
|
bool token::get_token_range(size_t& start, size_t& end) const { |
19677
|
|
|
|
|
|
|
string_piece value; |
19678
|
|
|
|
|
|
|
|
19679
|
0
|
0
|
|
|
|
|
if (!get_misc_field("TokenRange", value)) return false; |
19680
|
|
|
|
|
|
|
|
19681
|
0
|
|
|
|
|
|
start = 0; |
19682
|
0
|
0
|
|
|
|
|
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19683
|
0
|
0
|
|
|
|
|
if (start > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
19684
|
|
|
|
|
|
|
return false; |
19685
|
0
|
|
|
|
|
|
start = 10 * start + (value.str[0] - '0'); |
19686
|
0
|
|
|
|
|
|
value.str++, value.len--; |
19687
|
|
|
|
|
|
|
} |
19688
|
|
|
|
|
|
|
|
19689
|
0
|
0
|
|
|
|
|
if (value.len == 0 || value.str[0] != ':') return false; |
|
|
0
|
|
|
|
|
|
19690
|
0
|
|
|
|
|
|
value.str++, value.len--; |
19691
|
|
|
|
|
|
|
|
19692
|
0
|
|
|
|
|
|
end = 0; |
19693
|
0
|
0
|
|
|
|
|
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19694
|
0
|
0
|
|
|
|
|
if (end > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
19695
|
|
|
|
|
|
|
return false; |
19696
|
0
|
|
|
|
|
|
end = 10 * end + (value.str[0] - '0'); |
19697
|
0
|
|
|
|
|
|
value.str++, value.len--; |
19698
|
|
|
|
|
|
|
} |
19699
|
|
|
|
|
|
|
|
19700
|
|
|
|
|
|
|
return true; |
19701
|
|
|
|
|
|
|
} |
19702
|
|
|
|
|
|
|
|
19703
|
0
|
|
|
|
|
|
void token::set_token_range(size_t start, size_t end) { |
19704
|
0
|
0
|
|
|
|
|
if (start == size_t(string::npos)) |
19705
|
0
|
|
|
|
|
|
remove_misc_field("TokenRange"); |
19706
|
|
|
|
|
|
|
else |
19707
|
0
|
0
|
|
|
|
|
start_misc_field("TokenRange").append(to_string(start)).append(1, ':').append(to_string(end)); |
19708
|
0
|
|
|
|
|
|
} |
19709
|
|
|
|
|
|
|
|
19710
|
|
|
|
|
|
|
// Private MISC field helpers |
19711
|
12
|
|
|
|
|
|
bool token::get_misc_field(string_piece name, string_piece& value) const { |
19712
|
6
|
100
|
|
|
|
|
for (size_t index = 0; index < misc.size(); ) { |
19713
|
2
|
50
|
|
|
|
|
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
19714
|
2
|
|
|
|
|
|
index += name.len + 1; |
19715
|
2
|
|
|
|
|
|
value.str = misc.c_str() + index; |
19716
|
2
|
|
|
|
|
|
value.len = misc.find('|', index); |
19717
|
2
|
50
|
|
|
|
|
value.len = (value.len == size_t(string::npos) ? misc.size() : value.len) - index; |
19718
|
2
|
|
|
|
|
|
return true; |
19719
|
|
|
|
|
|
|
} |
19720
|
0
|
|
|
|
|
|
index = misc.find('|', index); |
19721
|
0
|
0
|
|
|
|
|
if (index != size_t(string::npos)) index++; |
19722
|
|
|
|
|
|
|
} |
19723
|
|
|
|
|
|
|
return false; |
19724
|
|
|
|
|
|
|
} |
19725
|
|
|
|
|
|
|
|
19726
|
64
|
|
|
|
|
|
void token::remove_misc_field(string_piece name) { |
19727
|
36
|
100
|
|
|
|
|
for (size_t index = 0; index < misc.size(); ) |
19728
|
8
|
100
|
|
|
|
|
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
19729
|
2
|
|
|
|
|
|
size_t end_index = misc.find('|', index + name.len + 1); |
19730
|
2
|
50
|
|
|
|
|
if (end_index == size_t(string::npos)) end_index = misc.size(); |
19731
|
|
|
|
|
|
|
|
19732
|
|
|
|
|
|
|
// Be careful to delete at most one neighboring '|' |
19733
|
2
|
50
|
|
|
|
|
if (index) |
19734
|
0
|
|
|
|
|
|
misc.erase(index - 1, end_index - (index - 1)); |
19735
|
|
|
|
|
|
|
else |
19736
|
2
|
50
|
|
|
|
|
misc.erase(index, end_index + (end_index < misc.size() ? 1 : 0) - index); |
19737
|
|
|
|
|
|
|
} else { |
19738
|
6
|
|
|
|
|
|
index = misc.find('|', index); |
19739
|
6
|
50
|
|
|
|
|
if (index != size_t(string::npos)) index++; |
19740
|
|
|
|
|
|
|
} |
19741
|
28
|
|
|
|
|
|
} |
19742
|
|
|
|
|
|
|
|
19743
|
2
|
|
|
|
|
|
string& token::start_misc_field(string_piece name) { |
19744
|
2
|
|
|
|
|
|
remove_misc_field(name); |
19745
|
2
|
50
|
|
|
|
|
if (!misc.empty()) misc.push_back('|'); |
19746
|
2
|
|
|
|
|
|
misc.append(name.str, name.len).push_back('='); |
19747
|
2
|
|
|
|
|
|
return misc; |
19748
|
|
|
|
|
|
|
} |
19749
|
|
|
|
|
|
|
|
19750
|
0
|
|
|
|
|
|
void token::append_escaped_spaces(string_piece spaces, string& escaped_spaces) const { |
19751
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < spaces.len; i++) |
19752
|
0
|
|
|
|
|
|
switch (spaces.str[i]) { |
19753
|
|
|
|
|
|
|
case ' ': |
19754
|
0
|
|
|
|
|
|
escaped_spaces.push_back('\\'); escaped_spaces.push_back('s'); break; |
19755
|
|
|
|
|
|
|
case '|': |
19756
|
0
|
|
|
|
|
|
escaped_spaces.push_back('\\'); escaped_spaces.push_back('p'); break; |
19757
|
|
|
|
|
|
|
case '\t': |
19758
|
0
|
|
|
|
|
|
escaped_spaces.push_back('\\'); escaped_spaces.push_back('t'); break; |
19759
|
|
|
|
|
|
|
case '\r': |
19760
|
0
|
|
|
|
|
|
escaped_spaces.push_back('\\'); escaped_spaces.push_back('r'); break; |
19761
|
|
|
|
|
|
|
case '\n': |
19762
|
0
|
|
|
|
|
|
escaped_spaces.push_back('\\'); escaped_spaces.push_back('n'); break; |
19763
|
|
|
|
|
|
|
case '\\': |
19764
|
0
|
|
|
|
|
|
escaped_spaces.push_back('\\'); escaped_spaces.push_back('\\'); break; |
19765
|
|
|
|
|
|
|
default: |
19766
|
0
|
|
|
|
|
|
escaped_spaces.push_back(spaces.str[i]); |
19767
|
|
|
|
|
|
|
} |
19768
|
0
|
|
|
|
|
|
} |
19769
|
|
|
|
|
|
|
|
19770
|
0
|
|
|
|
|
|
void token::unescape_spaces(string_piece escaped_spaces, string& spaces) const { |
19771
|
|
|
|
|
|
|
spaces.clear(); |
19772
|
|
|
|
|
|
|
|
19773
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < escaped_spaces.len; i++) |
19774
|
0
|
0
|
|
|
|
|
if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len) |
|
|
0
|
|
|
|
|
|
19775
|
0
|
|
|
|
|
|
spaces.push_back(escaped_spaces.str[i]); |
19776
|
0
|
|
|
|
|
|
else switch (escaped_spaces.str[++i]) { |
19777
|
|
|
|
|
|
|
case 's': |
19778
|
0
|
|
|
|
|
|
spaces.push_back(' '); break; |
19779
|
|
|
|
|
|
|
case 'p': |
19780
|
0
|
|
|
|
|
|
spaces.push_back('|'); break; |
19781
|
|
|
|
|
|
|
case 't': |
19782
|
0
|
|
|
|
|
|
spaces.push_back('\t'); break; |
19783
|
|
|
|
|
|
|
case 'r': |
19784
|
0
|
|
|
|
|
|
spaces.push_back('\r'); break; |
19785
|
|
|
|
|
|
|
case 'n': |
19786
|
0
|
|
|
|
|
|
spaces.push_back('\n'); break; |
19787
|
|
|
|
|
|
|
case '\\': |
19788
|
0
|
|
|
|
|
|
spaces.push_back('\\'); break; |
19789
|
|
|
|
|
|
|
default: |
19790
|
0
|
|
|
|
|
|
spaces.push_back(escaped_spaces.str[i - 1]); |
19791
|
0
|
|
|
|
|
|
spaces.push_back(escaped_spaces.str[i]); |
19792
|
|
|
|
|
|
|
} |
19793
|
0
|
|
|
|
|
|
} |
19794
|
|
|
|
|
|
|
|
19795
|
|
|
|
|
|
|
///////// |
19796
|
|
|
|
|
|
|
// File: tokenizer/detokenizer.h |
19797
|
|
|
|
|
|
|
///////// |
19798
|
|
|
|
|
|
|
|
19799
|
|
|
|
|
|
|
// This file is part of UDPipe . |
19800
|
|
|
|
|
|
|
// |
19801
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
19802
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
19803
|
|
|
|
|
|
|
// |
19804
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
19805
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
19806
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
19807
|
|
|
|
|
|
|
|
19808
|
0
|
|
|
|
|
|
class detokenizer { |
19809
|
|
|
|
|
|
|
public: |
19810
|
|
|
|
|
|
|
detokenizer(const string& plain_text); |
19811
|
|
|
|
|
|
|
|
19812
|
|
|
|
|
|
|
void detokenize(sentence& s) const; |
19813
|
|
|
|
|
|
|
private: |
19814
|
|
|
|
|
|
|
enum { LOWERCASE, CATEGORIZE, TOTAL }; |
19815
|
|
|
|
|
|
|
|
19816
|
|
|
|
|
|
|
int difference(const string& left, const string& right, bool separate, int mode) const; |
19817
|
|
|
|
|
|
|
|
19818
|
|
|
|
|
|
|
static string perform_lowercase(const string& input); |
19819
|
|
|
|
|
|
|
static string perform_categorize(const string& input); |
19820
|
|
|
|
|
|
|
bool has_letters(const string& word) const; |
19821
|
|
|
|
|
|
|
bool only_digits(const string& word) const; |
19822
|
|
|
|
|
|
|
|
19823
|
0
|
|
|
|
|
|
class suffix_array { |
19824
|
|
|
|
|
|
|
public: |
19825
|
|
|
|
|
|
|
suffix_array(const string& str); |
19826
|
|
|
|
|
|
|
suffix_array(suffix_array&& other) = default; |
19827
|
|
|
|
|
|
|
|
19828
|
|
|
|
|
|
|
unsigned count(const string& data) const; |
19829
|
|
|
|
|
|
|
|
19830
|
|
|
|
|
|
|
private: |
19831
|
|
|
|
|
|
|
vector sa; |
19832
|
|
|
|
|
|
|
|
19833
|
|
|
|
|
|
|
struct suffix_compare { |
19834
|
0
|
|
|
|
|
|
suffix_compare(const string& str) : str(str) {} |
19835
|
0
|
|
|
|
|
|
bool operator()(unsigned a, unsigned b) const { return str.compare(a, string::npos, str, b, string::npos) < 0; } |
19836
|
|
|
|
|
|
|
private: |
19837
|
|
|
|
|
|
|
const string& str; |
19838
|
|
|
|
|
|
|
} suffix_comparator; |
19839
|
|
|
|
|
|
|
|
19840
|
|
|
|
|
|
|
struct suffix_lower_find { |
19841
|
0
|
|
|
|
|
|
suffix_lower_find(const string& str) : str(str) {} |
19842
|
0
|
|
|
|
|
|
bool operator()(unsigned a, const string& data) const { return str.compare(a, data.size(), data) < 0; } |
19843
|
|
|
|
|
|
|
|
19844
|
|
|
|
|
|
|
private: |
19845
|
|
|
|
|
|
|
const string& str; |
19846
|
|
|
|
|
|
|
} suffix_lower_finder; |
19847
|
|
|
|
|
|
|
|
19848
|
|
|
|
|
|
|
struct suffix_upper_find { |
19849
|
0
|
|
|
|
|
|
suffix_upper_find(const string& str) : str(str) {} |
19850
|
0
|
|
|
|
|
|
bool operator()(const string& data, unsigned a) const { return str.compare(a, data.size(), data) > 0; } |
19851
|
|
|
|
|
|
|
|
19852
|
|
|
|
|
|
|
private: |
19853
|
|
|
|
|
|
|
const string& str; |
19854
|
|
|
|
|
|
|
} suffix_upper_finder; |
19855
|
|
|
|
|
|
|
}; |
19856
|
|
|
|
|
|
|
|
19857
|
|
|
|
|
|
|
string data_lowercased, data_categorized; |
19858
|
|
|
|
|
|
|
suffix_array sa_lowercased, sa_categorized; |
19859
|
|
|
|
|
|
|
}; |
19860
|
|
|
|
|
|
|
|
19861
|
|
|
|
|
|
|
///////// |
19862
|
|
|
|
|
|
|
// File: tokenizer/detokenizer.cpp |
19863
|
|
|
|
|
|
|
///////// |
19864
|
|
|
|
|
|
|
|
19865
|
|
|
|
|
|
|
// This file is part of UDPipe . |
19866
|
|
|
|
|
|
|
// |
19867
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
19868
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
19869
|
|
|
|
|
|
|
// |
19870
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
19871
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
19872
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
19873
|
|
|
|
|
|
|
|
19874
|
0
|
|
|
|
|
|
detokenizer::detokenizer(const string& plain_text) |
19875
|
|
|
|
|
|
|
: data_lowercased(perform_lowercase(plain_text)), data_categorized(perform_categorize(plain_text)), |
19876
|
0
|
0
|
|
|
|
|
sa_lowercased(data_lowercased), sa_categorized(data_categorized) {} |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19877
|
|
|
|
|
|
|
|
19878
|
0
|
|
|
|
|
|
void detokenizer::detokenize(sentence& s) const { |
19879
|
|
|
|
|
|
|
token* previous_tok = nullptr; |
19880
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
19881
|
0
|
0
|
|
|
|
|
token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i]; |
|
|
0
|
|
|
|
|
|
19882
|
|
|
|
|
|
|
|
19883
|
0
|
0
|
|
|
|
|
if (previous_tok) { |
19884
|
|
|
|
|
|
|
// Should we add SpaceAfter=No to the previous form? |
19885
|
0
|
|
|
|
|
|
int score = difference(previous_tok->form, tok->form, true, LOWERCASE); |
19886
|
0
|
0
|
|
|
|
|
if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19887
|
0
|
0
|
|
|
|
|
if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19888
|
0
|
0
|
|
|
|
|
if (!score) score = difference(previous_tok->form, tok->form, false, LOWERCASE); |
19889
|
0
|
0
|
|
|
|
|
if (!score) score = difference(previous_tok->form, tok->form, false, CATEGORIZE); |
19890
|
0
|
0
|
|
|
|
|
if (!score) score = difference(previous_tok->form, tok->form, true, CATEGORIZE); |
19891
|
|
|
|
|
|
|
|
19892
|
0
|
0
|
|
|
|
|
if (score > 0) |
19893
|
0
|
|
|
|
|
|
previous_tok->set_space_after(false); |
19894
|
|
|
|
|
|
|
} |
19895
|
|
|
|
|
|
|
|
19896
|
|
|
|
|
|
|
// Remove the SpaceAfter attribute on current token |
19897
|
0
|
|
|
|
|
|
tok->set_space_after(true); |
19898
|
|
|
|
|
|
|
previous_tok = tok; |
19899
|
|
|
|
|
|
|
|
19900
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19901
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
19902
|
|
|
|
|
|
|
} |
19903
|
0
|
|
|
|
|
|
} |
19904
|
|
|
|
|
|
|
|
19905
|
0
|
|
|
|
|
|
int detokenizer::difference(const string& left, const string& right, bool separate, int mode) const { |
19906
|
0
|
0
|
|
|
|
|
auto& func = mode == LOWERCASE ? perform_lowercase : perform_categorize; |
19907
|
0
|
0
|
|
|
|
|
auto& sa = mode == LOWERCASE ? sa_lowercased : sa_categorized; |
19908
|
|
|
|
|
|
|
|
19909
|
0
|
|
|
|
|
|
string left_mapped = func(left); |
19910
|
0
|
0
|
|
|
|
|
string right_mapped = func(right); |
19911
|
|
|
|
|
|
|
string pattern; |
19912
|
|
|
|
|
|
|
|
19913
|
0
|
0
|
|
|
|
|
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19914
|
0
|
0
|
|
|
|
|
int together = sa.count(pattern); |
19915
|
|
|
|
|
|
|
|
19916
|
0
|
0
|
|
|
|
|
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
19917
|
0
|
0
|
|
|
|
|
int apart = sa.count(pattern); |
19918
|
|
|
|
|
|
|
|
19919
|
0
|
|
|
|
|
|
return together - apart; |
19920
|
|
|
|
|
|
|
} |
19921
|
|
|
|
|
|
|
|
19922
|
0
|
|
|
|
|
|
string detokenizer::perform_lowercase(const string& input) { |
19923
|
|
|
|
|
|
|
using namespace unilib; |
19924
|
|
|
|
|
|
|
|
19925
|
|
|
|
|
|
|
string output; |
19926
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(input)) |
19927
|
0
|
0
|
|
|
|
|
utf8::append(output, unicode::lowercase(chr)); |
19928
|
0
|
|
|
|
|
|
return output; |
19929
|
|
|
|
|
|
|
} |
19930
|
|
|
|
|
|
|
|
19931
|
0
|
|
|
|
|
|
string detokenizer::perform_categorize(const string& input) { |
19932
|
|
|
|
|
|
|
using namespace unilib; |
19933
|
|
|
|
|
|
|
|
19934
|
|
|
|
|
|
|
string output; |
19935
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(input)) { |
19936
|
0
|
|
|
|
|
|
auto category = unicode::category(chr); |
19937
|
0
|
0
|
|
|
|
|
if (category & unicode::C) output.push_back('C'); |
|
|
0
|
|
|
|
|
|
19938
|
0
|
0
|
|
|
|
|
if (category & unicode::L) output.push_back('L'); |
|
|
0
|
|
|
|
|
|
19939
|
0
|
0
|
|
|
|
|
if (category & unicode::M) output.push_back('M'); |
|
|
0
|
|
|
|
|
|
19940
|
0
|
0
|
|
|
|
|
if (category & unicode::N) output.push_back('N'); |
|
|
0
|
|
|
|
|
|
19941
|
0
|
0
|
|
|
|
|
if (category & unicode::Pc) output.push_back('c'); |
|
|
0
|
|
|
|
|
|
19942
|
0
|
0
|
|
|
|
|
if (category & unicode::Pd) output.push_back('d'); |
|
|
0
|
|
|
|
|
|
19943
|
0
|
0
|
|
|
|
|
if (category & unicode::Pe) output.push_back('e'); |
|
|
0
|
|
|
|
|
|
19944
|
0
|
0
|
|
|
|
|
if (category & unicode::Pf) output.push_back('f'); |
|
|
0
|
|
|
|
|
|
19945
|
0
|
0
|
|
|
|
|
if (category & unicode::Pi) output.push_back('i'); |
|
|
0
|
|
|
|
|
|
19946
|
0
|
0
|
|
|
|
|
if (category & unicode::Po) output.push_back('o'); |
|
|
0
|
|
|
|
|
|
19947
|
0
|
0
|
|
|
|
|
if (category & unicode::Ps) output.push_back('s'); |
|
|
0
|
|
|
|
|
|
19948
|
0
|
0
|
|
|
|
|
if (category & unicode::S) output.push_back('S'); |
|
|
0
|
|
|
|
|
|
19949
|
0
|
0
|
|
|
|
|
if (category & unicode::Zl) output.push_back('Z'); |
|
|
0
|
|
|
|
|
|
19950
|
0
|
0
|
|
|
|
|
if (category & unicode::Zp) output.push_back('z'); |
|
|
0
|
|
|
|
|
|
19951
|
0
|
0
|
|
|
|
|
if (category & unicode::Zs) output.push_back(' '); |
|
|
0
|
|
|
|
|
|
19952
|
|
|
|
|
|
|
} |
19953
|
0
|
|
|
|
|
|
return output; |
19954
|
|
|
|
|
|
|
} |
19955
|
|
|
|
|
|
|
|
19956
|
0
|
|
|
|
|
|
bool detokenizer::has_letters(const string& word) const { |
19957
|
|
|
|
|
|
|
using namespace unilib; |
19958
|
|
|
|
|
|
|
|
19959
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) |
19960
|
0
|
0
|
|
|
|
|
if (unicode::category(chr) & unicode::L) |
19961
|
0
|
|
|
|
|
|
return true; |
19962
|
0
|
|
|
|
|
|
return false; |
19963
|
|
|
|
|
|
|
} |
19964
|
|
|
|
|
|
|
|
19965
|
0
|
|
|
|
|
|
bool detokenizer::only_digits(const string& word) const { |
19966
|
|
|
|
|
|
|
using namespace unilib; |
19967
|
|
|
|
|
|
|
|
19968
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) |
19969
|
0
|
0
|
|
|
|
|
if (unicode::category(chr) & ~unicode::N) |
19970
|
0
|
|
|
|
|
|
return false; |
19971
|
0
|
|
|
|
|
|
return true; |
19972
|
|
|
|
|
|
|
} |
19973
|
|
|
|
|
|
|
|
19974
|
0
|
|
|
|
|
|
detokenizer::suffix_array::suffix_array(const string& str) : suffix_comparator(str), suffix_lower_finder(str), suffix_upper_finder(str) { |
19975
|
0
|
0
|
|
|
|
|
sa.reserve(str.size()); |
19976
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < str.size(); i++) |
19977
|
0
|
0
|
|
|
|
|
sa.push_back(i); |
19978
|
|
|
|
|
|
|
|
19979
|
|
|
|
|
|
|
sort(sa.begin(), sa.end(), suffix_comparator); |
19980
|
0
|
|
|
|
|
|
} |
19981
|
|
|
|
|
|
|
|
19982
|
0
|
|
|
|
|
|
unsigned detokenizer::suffix_array::count(const string& data) const { |
19983
|
|
|
|
|
|
|
auto lower_it = lower_bound(sa.begin(), sa.end(), data, suffix_lower_finder); |
19984
|
|
|
|
|
|
|
auto upper_it = upper_bound(sa.begin(), sa.end(), data, suffix_upper_finder); |
19985
|
0
|
|
|
|
|
|
return upper_it - lower_it; |
19986
|
|
|
|
|
|
|
} |
19987
|
|
|
|
|
|
|
|
19988
|
|
|
|
|
|
|
///////// |
19989
|
|
|
|
|
|
|
// File: tokenizer/morphodita_tokenizer_wrapper.cpp |
19990
|
|
|
|
|
|
|
///////// |
19991
|
|
|
|
|
|
|
|
19992
|
|
|
|
|
|
|
// This file is part of UDPipe . |
19993
|
|
|
|
|
|
|
// |
19994
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
19995
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
19996
|
|
|
|
|
|
|
// |
19997
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
19998
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
19999
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20000
|
|
|
|
|
|
|
|
20001
|
1
|
|
|
|
|
|
morphodita_tokenizer_wrapper::morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter, |
20002
|
|
|
|
|
|
|
bool normalized_spaces, bool token_ranges) |
20003
|
1
|
50
|
|
|
|
|
: tokenizer(tokenizer), splitter(splitter), normalized_spaces(normalized_spaces), token_ranges(token_ranges) {} |
20004
|
|
|
|
|
|
|
|
20005
|
0
|
|
|
|
|
|
bool morphodita_tokenizer_wrapper::read_block(istream& is, string& block) const { |
20006
|
0
|
|
|
|
|
|
return bool(getpara(is, block)); |
20007
|
|
|
|
|
|
|
} |
20008
|
|
|
|
|
|
|
|
20009
|
0
|
|
|
|
|
|
void morphodita_tokenizer_wrapper::reset_document(string_piece id) { |
20010
|
0
|
|
|
|
|
|
new_document = true; |
20011
|
0
|
|
|
|
|
|
document_id.assign(id.str, id.len); |
20012
|
0
|
|
|
|
|
|
preceeding_newlines = 2; |
20013
|
0
|
|
|
|
|
|
sentence_id = 1; |
20014
|
0
|
|
|
|
|
|
set_text(""); |
20015
|
0
|
|
|
|
|
|
unicode_offset = 0; |
20016
|
0
|
|
|
|
|
|
text_unicode_length = 0; |
20017
|
|
|
|
|
|
|
saved_spaces.clear(); |
20018
|
0
|
|
|
|
|
|
} |
20019
|
|
|
|
|
|
|
|
20020
|
1
|
|
|
|
|
|
void morphodita_tokenizer_wrapper::set_text(string_piece text, bool make_copy) { |
20021
|
|
|
|
|
|
|
// Start by skipping spaces and copying them to saved_spaces |
20022
|
|
|
|
|
|
|
string_piece following; |
20023
|
1
|
50
|
|
|
|
|
for (char32_t chr; |
20024
|
2
|
50
|
|
|
|
|
text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
|
50
|
|
|
|
|
|
20025
|
1
|
50
|
|
|
|
|
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); |
|
|
50
|
|
|
|
|
|
20026
|
0
|
|
|
|
|
|
text = following, unicode_offset++) |
20027
|
0
|
|
|
|
|
|
saved_spaces.append(text.str, following.str - text.str); |
20028
|
|
|
|
|
|
|
|
20029
|
|
|
|
|
|
|
// Offset unicode_offset by length of previous text, update text_unicode_length for the new text |
20030
|
1
|
|
|
|
|
|
unicode_offset += text_unicode_length; |
20031
|
1
|
|
|
|
|
|
text_unicode_length = 0; |
20032
|
35
|
100
|
|
|
|
|
for (following = text; following.len; unilib::utf8::decode(following.str, following.len)) |
20033
|
34
|
|
|
|
|
|
text_unicode_length++; |
20034
|
|
|
|
|
|
|
|
20035
|
|
|
|
|
|
|
// Copy the text to local storage if needed |
20036
|
1
|
50
|
|
|
|
|
if (make_copy) { |
20037
|
1
|
|
|
|
|
|
text_copy.assign(text.str, text.len); |
20038
|
|
|
|
|
|
|
text = string_piece(text_copy.c_str(), text_copy.size()); |
20039
|
|
|
|
|
|
|
} |
20040
|
|
|
|
|
|
|
|
20041
|
|
|
|
|
|
|
// Store the text locally and in the morphodita::tokenizer |
20042
|
1
|
|
|
|
|
|
this->text = text; |
20043
|
1
|
|
|
|
|
|
tokenizer->set_text(this->text, false); |
20044
|
|
|
|
|
|
|
|
20045
|
1
|
|
|
|
|
|
} |
20046
|
|
|
|
|
|
|
|
20047
|
2
|
|
|
|
|
|
bool morphodita_tokenizer_wrapper::next_sentence(sentence& s, string& error) { |
20048
|
|
|
|
|
|
|
unsigned following_newlines = 0; |
20049
|
|
|
|
|
|
|
|
20050
|
2
|
|
|
|
|
|
s.clear(); |
20051
|
|
|
|
|
|
|
error.clear(); |
20052
|
|
|
|
|
|
|
|
20053
|
2
|
50
|
|
|
|
|
if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) { |
|
|
100
|
|
|
|
|
|
20054
|
|
|
|
|
|
|
// The forms returned by GRU tokenizer *should not* start/end with spaces, |
20055
|
|
|
|
|
|
|
// but we trim them anyway (including all "remove empty forms/sentences" machinery). |
20056
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < forms.size(); i++) { |
20057
|
14
|
50
|
|
|
|
|
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
20058
|
7
|
50
|
|
|
|
|
forms[i].str[0] == '\t' || forms[i].str[0] == ' ')) |
20059
|
0
|
|
|
|
|
|
forms[i].str++, forms[i].len--; |
20060
|
14
|
50
|
|
|
|
|
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
20061
|
7
|
50
|
|
|
|
|
forms[i].str[forms[i].len-1] == '\t' || forms[i].str[forms[i].len-1] == ' ')) |
20062
|
0
|
|
|
|
|
|
forms[i].len--; |
20063
|
7
|
50
|
|
|
|
|
if (!forms[i].len) |
20064
|
0
|
|
|
|
|
|
forms.erase(forms.begin() + i--); |
20065
|
|
|
|
|
|
|
} |
20066
|
8
|
50
|
|
|
|
|
if (!forms.size()) return next_sentence(s, error); |
20067
|
|
|
|
|
|
|
|
20068
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < forms.size(); i++) { |
20069
|
|
|
|
|
|
|
// The form might contain spaces, even '\r', '\n' or '\t', |
20070
|
|
|
|
|
|
|
// which we change to space. We also normalize multiple spaces to one. |
20071
|
|
|
|
|
|
|
tok.form.clear(); |
20072
|
41
|
100
|
|
|
|
|
for (size_t j = 0; j < forms[i].len; j++) { |
20073
|
34
|
|
|
|
|
|
char chr = forms[i].str[j]; |
20074
|
34
|
50
|
|
|
|
|
if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' '; |
|
|
50
|
|
|
|
|
|
20075
|
34
|
50
|
|
|
|
|
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
20076
|
34
|
|
|
|
|
|
tok.form.push_back(chr); |
20077
|
|
|
|
|
|
|
} |
20078
|
|
|
|
|
|
|
|
20079
|
|
|
|
|
|
|
// Track pre-sentence spaces and store SpacesBefore |
20080
|
7
|
100
|
|
|
|
|
if (i == 0) { |
20081
|
1
|
50
|
|
|
|
|
if (forms[0].str > text.str) |
20082
|
0
|
|
|
|
|
|
saved_spaces.append(text.str, forms[0].str - text.str); |
20083
|
1
|
|
|
|
|
|
preceeding_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n'); |
20084
|
|
|
|
|
|
|
} |
20085
|
7
|
50
|
|
|
|
|
if (!normalized_spaces) { |
20086
|
15
|
100
|
|
|
|
|
tok.set_spaces_before(i == 0 ? saved_spaces : ""); |
|
|
50
|
|
|
|
|
|
20087
|
|
|
|
|
|
|
} |
20088
|
|
|
|
|
|
|
saved_spaces.clear(); |
20089
|
|
|
|
|
|
|
|
20090
|
|
|
|
|
|
|
// Track post-sentence spaces and store SpaceAfter, SpacesInToken and SpacesAfter |
20091
|
7
|
100
|
|
|
|
|
if (i+1 == forms.size()) { |
20092
|
1
|
|
|
|
|
|
text.len -= forms[i].str + forms[i].len - text.str; |
20093
|
1
|
|
|
|
|
|
text.str = forms[i].str + forms[i].len; |
20094
|
|
|
|
|
|
|
|
20095
|
|
|
|
|
|
|
string_piece following; |
20096
|
3
|
100
|
|
|
|
|
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
20097
|
0
|
0
|
|
|
|
|
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following) |
|
|
0
|
|
|
|
|
|
20098
|
1
|
|
|
|
|
|
saved_spaces.append(text.str, following.str - text.str); |
20099
|
|
|
|
|
|
|
|
20100
|
1
|
|
|
|
|
|
following_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n'); |
20101
|
|
|
|
|
|
|
} |
20102
|
7
|
50
|
|
|
|
|
if (normalized_spaces) { |
20103
|
0
|
0
|
|
|
|
|
tok.set_space_after(i+1 == forms.size() ? !saved_spaces.empty() : forms[i+1].str > forms[i].str + forms[i].len); |
20104
|
|
|
|
|
|
|
} else { |
20105
|
7
|
50
|
|
|
|
|
tok.set_spaces_in_token(tok.form.size() != forms[i].len ? forms[i] : ""); |
20106
|
7
|
100
|
|
|
|
|
tok.set_spaces_after(i+1 == forms.size() ? saved_spaces : string_piece(forms[i].str + forms[i].len, forms[i+1].str - forms[i].str - forms[i].len)); |
20107
|
|
|
|
|
|
|
} |
20108
|
|
|
|
|
|
|
saved_spaces.clear(); |
20109
|
|
|
|
|
|
|
|
20110
|
|
|
|
|
|
|
// Store TokenRange if requested |
20111
|
7
|
50
|
|
|
|
|
if (token_ranges) |
20112
|
0
|
|
|
|
|
|
tok.set_token_range(unicode_offset + tokens[i].start, unicode_offset + tokens[i].start + tokens[i].length); |
20113
|
|
|
|
|
|
|
|
20114
|
7
|
50
|
|
|
|
|
if (splitter) |
20115
|
7
|
|
|
|
|
|
splitter->append_token(tok.form, tok.misc, s); |
20116
|
|
|
|
|
|
|
else |
20117
|
0
|
|
|
|
|
|
s.add_word(tok.form).misc.assign(tok.misc); |
20118
|
|
|
|
|
|
|
} |
20119
|
|
|
|
|
|
|
|
20120
|
|
|
|
|
|
|
// Mark new document if needed |
20121
|
1
|
50
|
|
|
|
|
if (new_document) { |
20122
|
1
|
|
|
|
|
|
s.set_new_doc(true, document_id); |
20123
|
1
|
|
|
|
|
|
new_document = false; |
20124
|
|
|
|
|
|
|
} |
20125
|
|
|
|
|
|
|
|
20126
|
|
|
|
|
|
|
// Mark new paragraph if needed |
20127
|
1
|
50
|
|
|
|
|
if (preceeding_newlines >= 2) |
20128
|
1
|
|
|
|
|
|
s.set_new_par(true); |
20129
|
1
|
|
|
|
|
|
preceeding_newlines = following_newlines; |
20130
|
|
|
|
|
|
|
|
20131
|
1
|
50
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
20132
|
|
|
|
|
|
|
|
20133
|
|
|
|
|
|
|
// Fill "# text" comment |
20134
|
8
|
|
|
|
|
|
s.comments.emplace_back("# text = "); |
20135
|
8
|
100
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
20136
|
7
|
50
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
|
0
|
|
|
|
|
|
20137
|
7
|
50
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
20138
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
20139
|
|
|
|
|
|
|
|
20140
|
|
|
|
|
|
|
s.comments.back().append(tok.form); |
20141
|
7
|
100
|
|
|
|
|
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
20142
|
|
|
|
|
|
|
} |
20143
|
|
|
|
|
|
|
|
20144
|
|
|
|
|
|
|
return true; |
20145
|
|
|
|
|
|
|
} |
20146
|
|
|
|
|
|
|
|
20147
|
|
|
|
|
|
|
// Save unused text parts. |
20148
|
1
|
50
|
|
|
|
|
if (text.len) { |
20149
|
0
|
|
|
|
|
|
saved_spaces.append(text.str, text.len); |
20150
|
0
|
|
|
|
|
|
text.str += text.len; |
20151
|
2
|
|
|
|
|
|
text.len = 0; |
20152
|
|
|
|
|
|
|
} |
20153
|
|
|
|
|
|
|
|
20154
|
|
|
|
|
|
|
return false; |
20155
|
|
|
|
|
|
|
} |
20156
|
|
|
|
|
|
|
|
20157
|
|
|
|
|
|
|
///////// |
20158
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter.cpp |
20159
|
|
|
|
|
|
|
///////// |
20160
|
|
|
|
|
|
|
|
20161
|
|
|
|
|
|
|
// This file is part of UDPipe . |
20162
|
|
|
|
|
|
|
// |
20163
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
20164
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20165
|
|
|
|
|
|
|
// |
20166
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20167
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20168
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20169
|
|
|
|
|
|
|
|
20170
|
7
|
|
|
|
|
|
void multiword_splitter::append_token(string_piece token, string_piece misc, sentence& s) const { |
20171
|
|
|
|
|
|
|
using namespace unilib; |
20172
|
|
|
|
|
|
|
|
20173
|
|
|
|
|
|
|
// Buffer |
20174
|
|
|
|
|
|
|
s.add_word(); |
20175
|
7
|
|
|
|
|
|
string& buffer = s.words.back().form; |
20176
|
|
|
|
|
|
|
|
20177
|
|
|
|
|
|
|
// Lowercase the token |
20178
|
7
|
|
|
|
|
|
utf8::map(unicode::lowercase, token.str, token.len, buffer); |
20179
|
|
|
|
|
|
|
reverse(buffer.begin(), buffer.end()); |
20180
|
|
|
|
|
|
|
|
20181
|
|
|
|
|
|
|
// Try finding lowercased version in the full_rules |
20182
|
|
|
|
|
|
|
size_t prefix_len = 0; |
20183
|
|
|
|
|
|
|
auto it = full_rules.find(buffer); |
20184
|
|
|
|
|
|
|
|
20185
|
7
|
50
|
|
|
|
|
if (it == full_rules.end()) { |
20186
|
7
|
50
|
|
|
|
|
if (version >= 2) { |
20187
|
0
|
|
|
|
|
|
string& suffix = s.words.back().misc; |
20188
|
|
|
|
|
|
|
// Try searching suffix_rules if needed |
20189
|
0
|
0
|
|
|
|
|
while (suffix.size() + 1 < buffer.size()) { |
20190
|
0
|
|
|
|
|
|
suffix.push_back(buffer[suffix.size()]); |
20191
|
|
|
|
|
|
|
|
20192
|
|
|
|
|
|
|
auto suffix_it = suffix_rules.find(suffix); |
20193
|
0
|
0
|
|
|
|
|
if (suffix_it == suffix_rules.end()) |
20194
|
|
|
|
|
|
|
break; |
20195
|
|
|
|
|
|
|
|
20196
|
0
|
0
|
|
|
|
|
if (!suffix_it->second.words.empty()) { |
20197
|
|
|
|
|
|
|
it = suffix_it; |
20198
|
0
|
|
|
|
|
|
prefix_len = buffer.size() - suffix.size(); |
20199
|
|
|
|
|
|
|
} |
20200
|
|
|
|
|
|
|
} |
20201
|
|
|
|
|
|
|
suffix.clear(); |
20202
|
|
|
|
|
|
|
} |
20203
|
|
|
|
|
|
|
|
20204
|
7
|
50
|
|
|
|
|
if (!prefix_len) { |
20205
|
|
|
|
|
|
|
// No match |
20206
|
14
|
|
|
|
|
|
s.words.back().form.assign(token.str, token.len); |
20207
|
7
|
100
|
|
|
|
|
if (misc.len) s.words.back().misc.assign(misc.str, misc.len); |
20208
|
|
|
|
|
|
|
return; |
20209
|
|
|
|
|
|
|
} |
20210
|
|
|
|
|
|
|
} |
20211
|
|
|
|
|
|
|
|
20212
|
|
|
|
|
|
|
// Determine casing |
20213
|
|
|
|
|
|
|
enum { UC_FIRST, UC_ALL, UC_OTHER }; int casing = UC_OTHER; |
20214
|
|
|
|
|
|
|
|
20215
|
0
|
0
|
|
|
|
|
if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) { |
20216
|
|
|
|
|
|
|
casing = UC_ALL; |
20217
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(token.str, token.len)) |
20218
|
0
|
0
|
|
|
|
|
if (unicode::category(chr) & (unicode::L & ~unicode::Lut)) { casing = UC_FIRST; break; } |
20219
|
|
|
|
|
|
|
} |
20220
|
|
|
|
|
|
|
|
20221
|
|
|
|
|
|
|
// Fill the multiword token |
20222
|
0
|
|
|
|
|
|
s.multiword_tokens.emplace_back(s.words.back().id, s.words.back().id + (int)it->second.words.size() - 1, token, misc); |
20223
|
|
|
|
|
|
|
|
20224
|
|
|
|
|
|
|
s.words.back().form.clear(); |
20225
|
0
|
0
|
|
|
|
|
if (prefix_len) { |
20226
|
|
|
|
|
|
|
// Note that prefix_len is measured in byte length of lowercased characters |
20227
|
0
|
|
|
|
|
|
string_piece suffix(token); |
20228
|
0
|
0
|
|
|
|
|
while (s.words.back().form.size() < prefix_len && suffix.len) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20229
|
0
|
|
|
|
|
|
utf8::append(s.words.back().form, unicode::lowercase(utf8::decode(suffix.str, suffix.len))); |
20230
|
0
|
|
|
|
|
|
s.words.back().form.assign(token.str, token.len - suffix.len); |
20231
|
|
|
|
|
|
|
} |
20232
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(it->second.words[0])) |
20233
|
0
|
0
|
|
|
|
|
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20234
|
|
|
|
|
|
|
|
20235
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < it->second.words.size(); i++) |
20236
|
0
|
0
|
|
|
|
|
if (casing != UC_ALL) { |
20237
|
|
|
|
|
|
|
s.add_word(it->second.words[i]); |
20238
|
|
|
|
|
|
|
} else { |
20239
|
|
|
|
|
|
|
s.add_word(); |
20240
|
0
|
|
|
|
|
|
utf8::map(unicode::uppercase, it->second.words[i], s.words.back().form); |
20241
|
|
|
|
|
|
|
} |
20242
|
|
|
|
|
|
|
} |
20243
|
|
|
|
|
|
|
|
20244
|
1
|
|
|
|
|
|
multiword_splitter* multiword_splitter::load(istream& is) { |
20245
|
|
|
|
|
|
|
char version; |
20246
|
1
|
50
|
|
|
|
|
if (!is.get(version)) return nullptr; |
20247
|
1
|
50
|
|
|
|
|
if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr; |
20248
|
|
|
|
|
|
|
|
20249
|
|
|
|
|
|
|
binary_decoder data; |
20250
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return nullptr; |
|
|
50
|
|
|
|
|
|
20251
|
|
|
|
|
|
|
|
20252
|
1
|
50
|
|
|
|
|
unique_ptr splitter(new multiword_splitter(version)); |
20253
|
|
|
|
|
|
|
try { |
20254
|
1
|
50
|
|
|
|
|
for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) { |
|
|
50
|
|
|
|
|
|
20255
|
|
|
|
|
|
|
string full_rule; |
20256
|
0
|
0
|
|
|
|
|
data.next_str(full_rule); |
20257
|
|
|
|
|
|
|
reverse(full_rule.begin(), full_rule.end()); |
20258
|
|
|
|
|
|
|
|
20259
|
|
|
|
|
|
|
// Add the full_rule and its words |
20260
|
|
|
|
|
|
|
auto& info = splitter->full_rules[full_rule]; |
20261
|
0
|
0
|
|
|
|
|
for (unsigned words = data.next_1B(); words; words--) { |
|
|
0
|
|
|
|
|
|
20262
|
0
|
0
|
|
|
|
|
info.words.emplace_back(); |
20263
|
0
|
0
|
|
|
|
|
data.next_str(info.words.back()); |
20264
|
|
|
|
|
|
|
} |
20265
|
0
|
0
|
|
|
|
|
if (info.words.empty()) return nullptr; |
20266
|
|
|
|
|
|
|
} |
20267
|
|
|
|
|
|
|
|
20268
|
1
|
50
|
|
|
|
|
if (version >= 2) |
20269
|
0
|
0
|
|
|
|
|
for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) { |
|
|
0
|
|
|
|
|
|
20270
|
|
|
|
|
|
|
string suffix_rule; |
20271
|
0
|
0
|
|
|
|
|
data.next_str(suffix_rule); |
20272
|
|
|
|
|
|
|
reverse(suffix_rule.begin(), suffix_rule.end()); |
20273
|
|
|
|
|
|
|
|
20274
|
|
|
|
|
|
|
// Add the suffix_rule and its words |
20275
|
|
|
|
|
|
|
auto& info = splitter->suffix_rules[suffix_rule]; |
20276
|
0
|
0
|
|
|
|
|
for (unsigned words = data.next_1B(); words; words--) { |
|
|
0
|
|
|
|
|
|
20277
|
0
|
0
|
|
|
|
|
info.words.emplace_back(); |
20278
|
0
|
0
|
|
|
|
|
data.next_str(info.words.back()); |
20279
|
|
|
|
|
|
|
} |
20280
|
0
|
0
|
|
|
|
|
if (info.words.empty()) return nullptr; |
20281
|
|
|
|
|
|
|
|
20282
|
|
|
|
|
|
|
// Add prefixes of the suffix with empty data |
20283
|
0
|
0
|
|
|
|
|
if (!suffix_rule.empty()) |
20284
|
0
|
0
|
|
|
|
|
for (suffix_rule.pop_back(); !suffix_rule.empty(); suffix_rule.pop_back()) |
20285
|
|
|
|
|
|
|
splitter->suffix_rules[suffix_rule]; |
20286
|
|
0
|
|
|
|
|
} |
20287
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
20288
|
|
|
|
|
|
|
return nullptr; |
20289
|
|
|
|
|
|
|
} |
20290
|
|
|
|
|
|
|
|
20291
|
1
|
50
|
|
|
|
|
return data.is_end() ? splitter.release() : nullptr; |
20292
|
|
|
|
|
|
|
} |
20293
|
|
|
|
|
|
|
|
20294
|
|
|
|
|
|
|
///////// |
20295
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter_trainer.h |
20296
|
|
|
|
|
|
|
///////// |
20297
|
|
|
|
|
|
|
|
20298
|
|
|
|
|
|
|
// This file is part of UDPipe . |
20299
|
|
|
|
|
|
|
// |
20300
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
20301
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20302
|
|
|
|
|
|
|
// |
20303
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20304
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20305
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20306
|
|
|
|
|
|
|
|
20307
|
|
|
|
|
|
|
class multiword_splitter_trainer { |
20308
|
|
|
|
|
|
|
public: |
20309
|
|
|
|
|
|
|
static bool train(const vector& data, ostream& os, string& error); |
20310
|
|
|
|
|
|
|
}; |
20311
|
|
|
|
|
|
|
|
20312
|
|
|
|
|
|
|
///////// |
20313
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter_trainer.cpp |
20314
|
|
|
|
|
|
|
///////// |
20315
|
|
|
|
|
|
|
|
20316
|
|
|
|
|
|
|
// This file is part of UDPipe . |
20317
|
|
|
|
|
|
|
// |
20318
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
20319
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20320
|
|
|
|
|
|
|
// |
20321
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20322
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20323
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20324
|
|
|
|
|
|
|
|
20325
|
0
|
|
|
|
|
|
bool multiword_splitter_trainer::train(const vector& data, ostream& os, string& error) { |
20326
|
|
|
|
|
|
|
using namespace unilib; |
20327
|
|
|
|
|
|
|
error.clear(); |
20328
|
|
|
|
|
|
|
|
20329
|
|
|
|
|
|
|
// Train |
20330
|
0
|
|
|
|
|
|
struct rule_info { |
20331
|
|
|
|
|
|
|
vector words; |
20332
|
|
|
|
|
|
|
unsigned count = 0; |
20333
|
|
|
|
|
|
|
}; |
20334
|
|
|
|
|
|
|
map full_rules, suffix_rules; |
20335
|
|
|
|
|
|
|
|
20336
|
|
|
|
|
|
|
// Full rules |
20337
|
|
|
|
|
|
|
string lc_form; |
20338
|
0
|
|
|
|
|
|
vector lc_words; |
20339
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
20340
|
0
|
0
|
|
|
|
|
for (auto&& multiword : sentence.multiword_tokens) { |
20341
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, multiword.form, lc_form); |
20342
|
0
|
|
|
|
|
|
lc_words.clear(); |
20343
|
0
|
0
|
|
|
|
|
for (int i = multiword.id_first; i <= multiword.id_last; i++) |
20344
|
0
|
0
|
|
|
|
|
utf8::map(unicode::lowercase, sentence.words[i].form, (lc_words.emplace_back(), lc_words.back())); |
20345
|
|
|
|
|
|
|
|
20346
|
0
|
0
|
|
|
|
|
auto& info = full_rules[lc_form]; |
20347
|
0
|
0
|
|
|
|
|
if (info.words.empty()) |
20348
|
0
|
|
|
|
|
|
info.words.assign(lc_words.begin(), lc_words.end()); |
20349
|
0
|
|
|
|
|
|
info.count += lc_words == info.words; |
20350
|
0
|
0
|
|
|
|
|
if (!info.count) full_rules.erase(lc_form); |
20351
|
|
|
|
|
|
|
} |
20352
|
|
|
|
|
|
|
|
20353
|
|
|
|
|
|
|
// Remove the full rules which trigger too negatively |
20354
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
20355
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
20356
|
0
|
0
|
|
|
|
|
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20357
|
0
|
|
|
|
|
|
i = sentence.multiword_tokens[j++].id_last; |
20358
|
|
|
|
|
|
|
continue; |
20359
|
|
|
|
|
|
|
} |
20360
|
|
|
|
|
|
|
|
20361
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, sentence.words[i].form, lc_form); |
20362
|
|
|
|
|
|
|
auto it = full_rules.find(lc_form); |
20363
|
0
|
0
|
|
|
|
|
if (it != full_rules.end()) |
20364
|
0
|
0
|
|
|
|
|
if (!--it->second.count) |
20365
|
|
|
|
|
|
|
full_rules.erase(it); |
20366
|
|
|
|
|
|
|
} |
20367
|
|
|
|
|
|
|
|
20368
|
|
|
|
|
|
|
// Suffix rules |
20369
|
0
|
0
|
|
|
|
|
for (auto&& full_rule : full_rules) { |
20370
|
|
|
|
|
|
|
size_t prefix_match = 0; |
20371
|
0
|
0
|
|
|
|
|
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20372
|
0
|
0
|
|
|
|
|
for (; prefix_match; prefix_match--) |
20373
|
0
|
0
|
|
|
|
|
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20374
|
0
|
0
|
|
|
|
|
lc_form.assign(full_rule.first, prefix_match, string::npos); |
20375
|
|
|
|
|
|
|
lc_words.assign(full_rule.second.words.begin(), full_rule.second.words.end()); |
20376
|
0
|
0
|
|
|
|
|
lc_words[0].erase(0, prefix_match); |
20377
|
|
|
|
|
|
|
|
20378
|
0
|
0
|
|
|
|
|
auto& info = suffix_rules[lc_form]; |
20379
|
0
|
0
|
|
|
|
|
if (info.words.empty()) |
20380
|
0
|
|
|
|
|
|
info.words.assign(lc_words.begin(), lc_words.end()); |
20381
|
0
|
|
|
|
|
|
info.count += lc_words == info.words; |
20382
|
0
|
0
|
|
|
|
|
if (!info.count) suffix_rules.erase(lc_form); |
20383
|
|
|
|
|
|
|
} |
20384
|
|
|
|
|
|
|
} |
20385
|
|
|
|
|
|
|
|
20386
|
|
|
|
|
|
|
// Remove the suffix rules which trigger too negatively |
20387
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
20388
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
20389
|
0
|
0
|
|
|
|
|
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20390
|
0
|
|
|
|
|
|
i = sentence.multiword_tokens[j++].id_last; |
20391
|
0
|
|
|
|
|
|
continue; |
20392
|
|
|
|
|
|
|
} |
20393
|
|
|
|
|
|
|
|
20394
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, sentence.words[i].form, lc_form); |
20395
|
0
|
0
|
|
|
|
|
while (lc_form.size() > 1) { |
20396
|
0
|
0
|
|
|
|
|
lc_form.erase(0, 1); |
20397
|
|
|
|
|
|
|
auto it = suffix_rules.find(lc_form); |
20398
|
0
|
0
|
|
|
|
|
if (it != suffix_rules.end()) { |
20399
|
0
|
0
|
|
|
|
|
if (it->second.count <= 10) |
20400
|
|
|
|
|
|
|
suffix_rules.erase(it); |
20401
|
|
|
|
|
|
|
else |
20402
|
0
|
|
|
|
|
|
it->second.count -= 10; |
20403
|
|
|
|
|
|
|
} |
20404
|
|
|
|
|
|
|
} |
20405
|
|
|
|
|
|
|
} |
20406
|
|
|
|
|
|
|
|
20407
|
|
|
|
|
|
|
// Encode |
20408
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
20409
|
0
|
|
|
|
|
|
enc.add_4B(full_rules.size()); |
20410
|
0
|
0
|
|
|
|
|
for (auto&& full_rule : full_rules) { |
20411
|
0
|
0
|
|
|
|
|
enc.add_str(full_rule.first); |
20412
|
0
|
0
|
|
|
|
|
enc.add_1B(full_rule.second.words.size()); |
20413
|
0
|
0
|
|
|
|
|
for (auto& word : full_rule.second.words) |
20414
|
0
|
0
|
|
|
|
|
enc.add_str(word); |
20415
|
|
|
|
|
|
|
} |
20416
|
0
|
|
|
|
|
|
enc.add_4B(suffix_rules.size()); |
20417
|
0
|
0
|
|
|
|
|
for (auto&& suffix_rule : suffix_rules) { |
20418
|
0
|
0
|
|
|
|
|
enc.add_str(suffix_rule.first); |
20419
|
0
|
0
|
|
|
|
|
enc.add_1B(suffix_rule.second.words.size()); |
20420
|
0
|
0
|
|
|
|
|
for (auto& word : suffix_rule.second.words) |
20421
|
0
|
0
|
|
|
|
|
enc.add_str(word); |
20422
|
|
|
|
|
|
|
} |
20423
|
|
|
|
|
|
|
|
20424
|
|
|
|
|
|
|
// Save |
20425
|
0
|
0
|
|
|
|
|
os.put(multiword_splitter::VERSION_LATEST); |
20426
|
0
|
0
|
|
|
|
|
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20427
|
|
|
|
|
|
|
|
20428
|
|
|
|
|
|
|
return true; |
20429
|
|
|
|
|
|
|
} |
20430
|
|
|
|
|
|
|
|
20431
|
|
|
|
|
|
|
///////// |
20432
|
|
|
|
|
|
|
// File: trainer/trainer.h |
20433
|
|
|
|
|
|
|
///////// |
20434
|
|
|
|
|
|
|
|
20435
|
|
|
|
|
|
|
// This file is part of UDPipe . |
20436
|
|
|
|
|
|
|
// |
20437
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20438
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20439
|
|
|
|
|
|
|
// |
20440
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20441
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20442
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20443
|
|
|
|
|
|
|
|
20444
|
|
|
|
|
|
|
class trainer { |
20445
|
|
|
|
|
|
|
public: |
20446
|
|
|
|
|
|
|
static bool train(const string& method, const vector& train, const vector& heldout, |
20447
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error); |
20448
|
|
|
|
|
|
|
|
20449
|
|
|
|
|
|
|
static const string DEFAULT; |
20450
|
|
|
|
|
|
|
static const string NONE; |
20451
|
|
|
|
|
|
|
|
20452
|
|
|
|
|
|
|
protected: |
20453
|
|
|
|
|
|
|
static unsigned hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum); |
20454
|
|
|
|
|
|
|
static double hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum); |
20455
|
|
|
|
|
|
|
static double hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum); |
20456
|
|
|
|
|
|
|
|
20457
|
|
|
|
|
|
|
private: |
20458
|
|
|
|
|
|
|
static double rnd(unsigned run, unsigned index); |
20459
|
|
|
|
|
|
|
}; |
20460
|
|
|
|
|
|
|
|
20461
|
|
|
|
|
|
|
///////// |
20462
|
|
|
|
|
|
|
// File: trainer/trainer_morphodita_parsito.h |
20463
|
|
|
|
|
|
|
///////// |
20464
|
|
|
|
|
|
|
|
20465
|
|
|
|
|
|
|
// This file is part of UDPipe . |
20466
|
|
|
|
|
|
|
// |
20467
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20468
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20469
|
|
|
|
|
|
|
// |
20470
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20471
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20472
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20473
|
|
|
|
|
|
|
|
20474
|
|
|
|
|
|
|
class trainer_morphodita_parsito : public trainer { |
20475
|
|
|
|
|
|
|
public: |
20476
|
|
|
|
|
|
|
static bool train(const vector& training, const vector& heldout, |
20477
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error); |
20478
|
|
|
|
|
|
|
|
20479
|
|
|
|
|
|
|
private: |
20480
|
|
|
|
|
|
|
static bool train_tokenizer(const vector& training, const vector& heldout, |
20481
|
|
|
|
|
|
|
const string& options, ostream& os, string& error); |
20482
|
|
|
|
|
|
|
static bool train_tagger(const vector& training, const vector& heldout, |
20483
|
|
|
|
|
|
|
const string& options, ostream& os, string& error); |
20484
|
|
|
|
|
|
|
static bool train_parser(const vector& training, const vector& heldout, |
20485
|
|
|
|
|
|
|
const string& options, const string& tagger_model, ostream& os, string& error); |
20486
|
|
|
|
|
|
|
|
20487
|
|
|
|
|
|
|
// Generic model methods |
20488
|
|
|
|
|
|
|
enum model_type { TOKENIZER_MODEL, TAGGER_MODEL, PARSER_MODEL }; |
20489
|
|
|
|
|
|
|
static bool load_model(const string& data, model_type model, string_piece& range); |
20490
|
|
|
|
|
|
|
static const string& model_normalize_form(string_piece form, string& output); |
20491
|
|
|
|
|
|
|
static const string& model_normalize_lemma(string_piece lemma, string& output); |
20492
|
|
|
|
|
|
|
static void model_fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word); |
20493
|
|
|
|
|
|
|
|
20494
|
|
|
|
|
|
|
// Tagger-specific model methods |
20495
|
|
|
|
|
|
|
static bool train_tagger_model(const vector& training, const vector& heldout, |
20496
|
|
|
|
|
|
|
unsigned model, unsigned models, const named_values::map& tagger, ostream& os, string& error); |
20497
|
|
|
|
|
|
|
static bool can_combine_tag(const word& w, string& error); |
20498
|
|
|
|
|
|
|
static const string& combine_tag(const word& w, bool xpostag, bool feats, string& combined_tag); |
20499
|
|
|
|
|
|
|
static const string& most_frequent_tag(const vector& data, const string& upostag, bool xpostag, bool feats, string& combined_tag); |
20500
|
|
|
|
|
|
|
static const string& combine_lemma(const word& w, int use_lemma, string& combined_lemma, const unordered_set& flat_lemmas = unordered_set()); |
20501
|
|
|
|
|
|
|
|
20502
|
|
|
|
|
|
|
// Generic options handling |
20503
|
|
|
|
|
|
|
static const string& option_str(const named_values::map& options, const string& name, int model = -1); |
20504
|
|
|
|
|
|
|
static bool option_int(const named_values::map& options, const string& name, int& value, string& error, int model = -1); |
20505
|
|
|
|
|
|
|
static bool option_bool(const named_values::map& options, const string& name, bool& value, string& error, int model = -1); |
20506
|
|
|
|
|
|
|
static bool option_double(const named_values::map& options, const string& name, double& value, string& error, int model = -1); |
20507
|
|
|
|
|
|
|
|
20508
|
|
|
|
|
|
|
// Various string data |
20509
|
|
|
|
|
|
|
static const string empty_string; |
20510
|
|
|
|
|
|
|
static const string tag_separators; |
20511
|
|
|
|
|
|
|
static const string tagger_features_tagger; |
20512
|
|
|
|
|
|
|
static const string tagger_features_lemmatizer; |
20513
|
|
|
|
|
|
|
static const string parser_nodes; |
20514
|
|
|
|
|
|
|
}; |
20515
|
|
|
|
|
|
|
|
20516
|
|
|
|
|
|
|
///////// |
20517
|
|
|
|
|
|
|
// File: trainer/trainer.cpp |
20518
|
|
|
|
|
|
|
///////// |
20519
|
|
|
|
|
|
|
|
20520
|
|
|
|
|
|
|
// This file is part of UDPipe . |
20521
|
|
|
|
|
|
|
// |
20522
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20523
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20524
|
|
|
|
|
|
|
// |
20525
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20526
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20527
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20528
|
|
|
|
|
|
|
|
20529
|
2
|
|
|
|
|
|
const string trainer::DEFAULT; |
20530
|
2
|
|
|
|
|
|
const string trainer::NONE = "none"; |
20531
|
|
|
|
|
|
|
|
20532
|
0
|
|
|
|
|
|
bool trainer::train(const string& method, const vector& training, const vector& heldout, |
20533
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error) { |
20534
|
|
|
|
|
|
|
error.clear(); |
20535
|
|
|
|
|
|
|
|
20536
|
0
|
0
|
|
|
|
|
stringstream os_buffer; |
20537
|
0
|
0
|
|
|
|
|
os_buffer.put(method.size()); |
20538
|
0
|
0
|
|
|
|
|
os_buffer.write(method.c_str(), method.size()); |
20539
|
|
|
|
|
|
|
|
20540
|
|
|
|
|
|
|
try { |
20541
|
0
|
0
|
|
|
|
|
if (method == "morphodita_parsito") { |
20542
|
0
|
0
|
|
|
|
|
if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error)) |
|
|
0
|
|
|
|
|
|
20543
|
|
|
|
|
|
|
return false; |
20544
|
|
|
|
|
|
|
} else { |
20545
|
0
|
0
|
|
|
|
|
error.assign("Unknown UDPipe method '").append(method).append("'!"); |
|
|
0
|
|
|
|
|
|
20546
|
|
|
|
|
|
|
return false; |
20547
|
|
0
|
|
|
|
|
} |
|
|
0
|
|
|
|
|
|
20548
|
|
|
|
|
|
|
} catch (training_error& e) { |
20549
|
|
|
|
|
|
|
error.assign(e.what()); |
20550
|
|
|
|
|
|
|
return false; |
20551
|
|
|
|
|
|
|
} |
20552
|
|
|
|
|
|
|
|
20553
|
0
|
0
|
|
|
|
|
os << os_buffer.rdbuf(); |
20554
|
|
|
|
|
|
|
return true; |
20555
|
|
|
|
|
|
|
} |
20556
|
|
|
|
|
|
|
|
20557
|
0
|
|
|
|
|
|
unsigned trainer::hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum) { |
20558
|
0
|
|
|
|
|
|
return minimum + int((maximum - minimum + 1) * rnd(run, index)); |
20559
|
|
|
|
|
|
|
} |
20560
|
|
|
|
|
|
|
|
20561
|
0
|
|
|
|
|
|
double trainer::hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum) { |
20562
|
0
|
|
|
|
|
|
return minimum + (maximum - minimum) * rnd(run, index); |
20563
|
|
|
|
|
|
|
} |
20564
|
|
|
|
|
|
|
|
20565
|
0
|
|
|
|
|
|
double trainer::hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum) { |
20566
|
0
|
|
|
|
|
|
return exp(log(minimum) + (log(maximum) - log(minimum)) * rnd(run, index)); |
20567
|
|
|
|
|
|
|
} |
20568
|
|
|
|
|
|
|
|
20569
|
0
|
|
|
|
|
|
double trainer::rnd(unsigned run, unsigned index) { |
20570
|
|
|
|
|
|
|
uint32_t state = 12345U; |
20571
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < 10; i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20572
|
0
|
|
|
|
|
|
state = state * 1103515245U + run * 19999999U + index * 1000000007U + 12345U; |
20573
|
0
|
|
|
|
|
|
return (state >> 16) / double(1<<16); |
20574
|
|
|
|
|
|
|
} |
20575
|
|
|
|
|
|
|
|
20576
|
|
|
|
|
|
|
///////// |
20577
|
|
|
|
|
|
|
// File: morphodita/tagger/elementary_features_encoder.h |
20578
|
|
|
|
|
|
|
///////// |
20579
|
|
|
|
|
|
|
|
20580
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
20581
|
|
|
|
|
|
|
// |
20582
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20583
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20584
|
|
|
|
|
|
|
// |
20585
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20586
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20587
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20588
|
|
|
|
|
|
|
|
20589
|
|
|
|
|
|
|
namespace morphodita { |
20590
|
|
|
|
|
|
|
|
20591
|
|
|
|
|
|
|
template |
20592
|
0
|
|
|
|
|
|
inline bool elementary_features |
20593
|
0
|
|
|
|
|
|
binary_encoder enc; |
20594
|
|
|
|
|
|
|
|
20595
|
0
|
0
|
|
|
|
|
enc.add_1B(maps.size()); |
20596
|
0
|
0
|
|
|
|
|
for (auto&& map : maps) |
20597
|
0
|
0
|
|
|
|
|
map.save(enc); |
20598
|
|
|
|
|
|
|
|
20599
|
0
|
0
|
|
|
|
|
return compressor::save(os, enc); |
20600
|
|
|
|
|
|
|
} |
20601
|
|
|
|
|
|
|
|
20602
|
|
|
|
|
|
|
} // namespace morphodita |
20603
|
|
|
|
|
|
|
|
20604
|
|
|
|
|
|
|
///////// |
20605
|
|
|
|
|
|
|
// File: morphodita/tagger/feature_sequences_encoder.h |
20606
|
|
|
|
|
|
|
///////// |
20607
|
|
|
|
|
|
|
|
20608
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
20609
|
|
|
|
|
|
|
// |
20610
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20611
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20612
|
|
|
|
|
|
|
// |
20613
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20614
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20615
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20616
|
|
|
|
|
|
|
|
20617
|
|
|
|
|
|
|
namespace morphodita { |
20618
|
|
|
|
|
|
|
|
20619
|
|
|
|
|
|
|
template |
20620
|
0
|
|
|
|
|
|
void feature_sequences::parse(int window_size, istream& is) { |
20621
|
|
|
|
|
|
|
unordered_map elementary_map; |
20622
|
0
|
0
|
|
|
|
|
for (auto&& description : ElementaryFeatures::descriptions) |
20623
|
0
|
0
|
|
|
|
|
if (!elementary_map.emplace(description.name, description).second) |
20624
|
0
|
0
|
|
|
|
|
training_failure("Repeated elementary feature with name " << description.name << '!'); |
20625
|
|
|
|
|
|
|
|
20626
|
|
|
|
|
|
|
string line; |
20627
|
0
|
|
|
|
|
|
vector tokens; |
20628
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
0
|
|
|
|
|
|
20629
|
0
|
0
|
|
|
|
|
split(line, ',', tokens); |
20630
|
0
|
0
|
|
|
|
|
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20631
|
|
|
|
|
|
|
|
20632
|
|
|
|
|
|
|
bool contains_only_current = false; |
20633
|
0
|
0
|
|
|
|
|
sequences.emplace_back(); |
20634
|
0
|
0
|
|
|
|
|
for (auto&& token : tokens) { |
20635
|
0
|
|
|
|
|
|
vector parts; |
20636
|
0
|
0
|
|
|
|
|
split(token, ' ', parts); |
20637
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20638
|
|
|
|
|
|
|
auto it = elementary_map.find(parts[0]); |
20639
|
0
|
0
|
|
|
|
|
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20640
|
|
|
|
|
|
|
|
20641
|
|
|
|
|
|
|
auto& desc = it->second; |
20642
|
0
|
0
|
|
|
|
|
int sequence_index = parse_int(parts[1].c_str(), "sequence_index"); |
20643
|
0
|
0
|
|
|
|
|
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20644
|
0
|
0
|
|
|
|
|
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20645
|
0
|
0
|
|
|
|
|
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20646
|
|
|
|
|
|
|
|
20647
|
0
|
0
|
|
|
|
|
sequences.back().elements.emplace_back(it->second.type, it->second.index, sequence_index); |
20648
|
0
|
0
|
|
|
|
|
if (desc.type == DYNAMIC) sequences.back().dependant_range = max(sequences.back().dependant_range, window_size + 1); |
20649
|
0
|
0
|
|
|
|
|
if (desc.type == PER_TAG) sequences.back().dependant_range = max(sequences.back().dependant_range, 1 - sequence_index); |
20650
|
0
|
|
|
|
|
|
contains_only_current |= desc.range == ONLY_CURRENT; |
20651
|
|
|
|
|
|
|
} |
20652
|
0
|
0
|
|
|
|
|
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20653
|
|
|
|
|
|
|
} |
20654
|
|
|
|
|
|
|
|
20655
|
0
|
|
|
|
|
|
stable_sort(sequences.begin(), sequences.end(), [](const feature_sequence& a, const feature_sequence& b) { return a.dependant_range > b.dependant_range; }); |
20656
|
0
|
0
|
|
|
|
|
scores.resize(sequences.size()); |
20657
|
0
|
|
|
|
|
|
} |
20658
|
|
|
|
|
|
|
|
20659
|
|
|
|
|
|
|
template |
20660
|
0
|
|
|
|
|
|
inline bool feature_sequences::save(ostream& os) { |
20661
|
0
|
0
|
|
|
|
|
if (!elementary.save(os)) return false; |
20662
|
|
|
|
|
|
|
|
20663
|
0
|
|
|
|
|
|
binary_encoder enc; |
20664
|
0
|
0
|
|
|
|
|
enc.add_1B(sequences.size()); |
20665
|
0
|
0
|
|
|
|
|
for (auto&& sequence : sequences) { |
20666
|
0
|
|
|
|
|
|
enc.add_4B(sequence.dependant_range); |
20667
|
0
|
0
|
|
|
|
|
enc.add_1B(sequence.elements.size()); |
20668
|
0
|
0
|
|
|
|
|
for (auto&& element : sequence.elements) { |
20669
|
0
|
|
|
|
|
|
enc.add_4B(element.type); |
20670
|
0
|
|
|
|
|
|
enc.add_4B(element.elementary_index); |
20671
|
0
|
|
|
|
|
|
enc.add_4B(element.sequence_index); |
20672
|
|
|
|
|
|
|
} |
20673
|
|
|
|
|
|
|
} |
20674
|
|
|
|
|
|
|
|
20675
|
0
|
0
|
|
|
|
|
enc.add_1B(scores.size()); |
20676
|
0
|
0
|
|
|
|
|
for (auto&& score : scores) |
20677
|
0
|
0
|
|
|
|
|
score.save(enc); |
20678
|
|
|
|
|
|
|
|
20679
|
0
|
0
|
|
|
|
|
return compressor::save(os, enc); |
20680
|
|
|
|
|
|
|
} |
20681
|
|
|
|
|
|
|
|
20682
|
|
|
|
|
|
|
} // namespace morphodita |
20683
|
|
|
|
|
|
|
|
20684
|
|
|
|
|
|
|
///////// |
20685
|
|
|
|
|
|
|
// File: morphodita/tagger/training_maps.h |
20686
|
|
|
|
|
|
|
///////// |
20687
|
|
|
|
|
|
|
|
20688
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
20689
|
|
|
|
|
|
|
// |
20690
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20691
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20692
|
|
|
|
|
|
|
// |
20693
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20694
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20695
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20696
|
|
|
|
|
|
|
|
20697
|
|
|
|
|
|
|
namespace morphodita { |
20698
|
|
|
|
|
|
|
|
20699
|
|
|
|
|
|
|
// Declarations |
20700
|
0
|
0
|
|
|
|
|
class training_elementary_feature_map { |
|
|
0
|
|
|
|
|
|
20701
|
|
|
|
|
|
|
public: |
20702
|
|
|
|
|
|
|
inline elementary_feature_value value(const char* feature, int len) const; |
20703
|
|
|
|
|
|
|
mutable unordered_map map = {{"", elementary_feature_empty}}; |
20704
|
|
|
|
|
|
|
private: |
20705
|
|
|
|
|
|
|
mutable string key; |
20706
|
|
|
|
|
|
|
}; |
20707
|
|
|
|
|
|
|
|
20708
|
0
|
|
|
|
|
|
class training_feature_sequence_map { |
20709
|
|
|
|
|
|
|
public: |
20710
|
|
|
|
|
|
|
struct info { |
20711
|
|
|
|
|
|
|
// We deliberately use feature_sequence*s*_score to check for overflow |
20712
|
|
|
|
|
|
|
feature_sequences_score alpha = 0; |
20713
|
|
|
|
|
|
|
feature_sequences_score gamma = 0; |
20714
|
|
|
|
|
|
|
int last_gamma_update = 0; |
20715
|
|
|
|
|
|
|
}; |
20716
|
|
|
|
|
|
|
|
20717
|
|
|
|
|
|
|
inline feature_sequence_score score(const char* feature, int len) const; |
20718
|
|
|
|
|
|
|
mutable unordered_map map; |
20719
|
|
|
|
|
|
|
private: |
20720
|
|
|
|
|
|
|
mutable string key; |
20721
|
|
|
|
|
|
|
}; |
20722
|
|
|
|
|
|
|
|
20723
|
|
|
|
|
|
|
template class ElementaryFeatures> using train_feature_sequences = feature_sequences, training_feature_sequence_map>; |
20724
|
|
|
|
|
|
|
|
20725
|
|
|
|
|
|
|
// Definitions |
20726
|
0
|
|
|
|
|
|
elementary_feature_value training_elementary_feature_map::value(const char* feature, int len) const { |
20727
|
0
|
|
|
|
|
|
key.assign(feature, len); |
20728
|
0
|
|
|
|
|
|
return map.emplace(key, elementary_feature_empty + elementary_feature_value(map.size())).first->second; |
20729
|
|
|
|
|
|
|
} |
20730
|
|
|
|
|
|
|
|
20731
|
0
|
|
|
|
|
|
feature_sequence_score training_feature_sequence_map::score(const char* feature, int len) const { |
20732
|
0
|
|
|
|
|
|
key.assign(feature, len); |
20733
|
|
|
|
|
|
|
auto it = map.find(key); |
20734
|
0
|
0
|
|
|
|
|
return it != map.end() ? it->second.alpha : 0; |
20735
|
|
|
|
|
|
|
} |
20736
|
|
|
|
|
|
|
|
20737
|
|
|
|
|
|
|
} // namespace morphodita |
20738
|
|
|
|
|
|
|
|
20739
|
|
|
|
|
|
|
///////// |
20740
|
|
|
|
|
|
|
// File: morphodita/tagger/feature_sequences_optimizer.h |
20741
|
|
|
|
|
|
|
///////// |
20742
|
|
|
|
|
|
|
|
20743
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
20744
|
|
|
|
|
|
|
// |
20745
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20746
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20747
|
|
|
|
|
|
|
// |
20748
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20749
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20750
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20751
|
|
|
|
|
|
|
|
20752
|
|
|
|
|
|
|
namespace morphodita { |
20753
|
|
|
|
|
|
|
|
20754
|
|
|
|
|
|
|
// Declarations |
20755
|
|
|
|
|
|
|
template |
20756
|
|
|
|
|
|
|
class feature_sequences_optimizer; |
20757
|
|
|
|
|
|
|
|
20758
|
|
|
|
|
|
|
template class FeatureSequences, template class ElementaryFeatures> |
20759
|
|
|
|
|
|
|
class feature_sequences_optimizer, training_feature_sequence_map>> { |
20760
|
|
|
|
|
|
|
public: |
20761
|
|
|
|
|
|
|
typedef FeatureSequences, training_feature_sequence_map> original_feature_sequences; |
20762
|
|
|
|
|
|
|
typedef FeatureSequences, persistent_feature_sequence_map> optimized_feature_sequences; |
20763
|
|
|
|
|
|
|
|
20764
|
|
|
|
|
|
|
static void optimize(const original_feature_sequences& features, optimized_feature_sequences& optimized_features); |
20765
|
|
|
|
|
|
|
}; |
20766
|
|
|
|
|
|
|
|
20767
|
|
|
|
|
|
|
// Definitions |
20768
|
|
|
|
|
|
|
template class FeatureSequences, template class ElementaryFeatures> |
20769
|
0
|
|
|
|
|
|
void feature_sequences_optimizer, training_feature_sequence_map>>::optimize(const original_feature_sequences& features, optimized_feature_sequences& optimized_features) { |
20770
|
|
|
|
|
|
|
const ElementaryFeatures& elementary = features.elementary; |
20771
|
|
|
|
|
|
|
ElementaryFeatures& optimized_elementary = optimized_features.elementary; |
20772
|
|
|
|
|
|
|
|
20773
|
|
|
|
|
|
|
// Iterate over feature sequences of non-zero weight and count number of |
20774
|
|
|
|
|
|
|
// occurences in corresponding elementary feature maps. |
20775
|
|
|
|
|
|
|
// In order to be able to do so, precompute map_index for elements of features.sequences. |
20776
|
0
|
|
|
|
|
|
vector> map_indices(features.sequences.size()); |
20777
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < map_indices.size(); i++) { |
20778
|
0
|
0
|
|
|
|
|
for (auto&& element : features.sequences[i].elements) |
20779
|
0
|
0
|
|
|
|
|
for (auto&& description : decltype(features.elementary)::descriptions) |
20780
|
0
|
0
|
|
|
|
|
if (element.type == description.type && element.elementary_index == description.index) |
|
|
0
|
|
|
|
|
|
20781
|
0
|
0
|
|
|
|
|
map_indices[i].emplace_back(description.map_index); |
20782
|
|
|
|
|
|
|
|
20783
|
0
|
0
|
|
|
|
|
assert(map_indices[i].size() == features.sequences[i].elements.size()); |
20784
|
|
|
|
|
|
|
} |
20785
|
|
|
|
|
|
|
|
20786
|
|
|
|
|
|
|
struct count_info { elementary_feature_value ori = 0; int count = 0; }; |
20787
|
0
|
0
|
|
|
|
|
vector> counts(elementary.maps.size()); |
20788
|
|
|
|
|
|
|
vector elementary_ids; |
20789
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < features.sequences.size(); i++) |
20790
|
0
|
0
|
|
|
|
|
for (auto&& element : features.scores[i].map) |
20791
|
0
|
0
|
|
|
|
|
if (element.second.gamma) { |
20792
|
|
|
|
|
|
|
elementary_ids.clear(); |
20793
|
0
|
0
|
|
|
|
|
for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size())) |
|
|
0
|
|
|
|
|
|
20794
|
0
|
0
|
|
|
|
|
elementary_ids.emplace_back(vli::decode(key)); |
20795
|
|
|
|
|
|
|
|
20796
|
0
|
0
|
|
|
|
|
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
20797
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
20798
|
0
|
0
|
|
|
|
|
if (map_indices[i][j] < 0) continue; |
20799
|
0
|
0
|
|
|
|
|
if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1); |
|
|
0
|
|
|
|
|
|
20800
|
0
|
|
|
|
|
|
counts[map_indices[i][j]][elementary_ids[j]].count++; |
20801
|
|
|
|
|
|
|
} |
20802
|
|
|
|
|
|
|
} |
20803
|
|
|
|
|
|
|
|
20804
|
|
|
|
|
|
|
// Sort counts by sizes decreasing |
20805
|
0
|
0
|
|
|
|
|
for (auto&& count : counts) { |
20806
|
0
|
0
|
|
|
|
|
if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1); |
|
|
0
|
|
|
|
|
|
20807
|
0
|
|
|
|
|
|
count[elementary_feature_unknown].count = 0; |
20808
|
0
|
|
|
|
|
|
count[elementary_feature_empty].count = 1; |
20809
|
0
|
0
|
|
|
|
|
for (elementary_feature_value i = 0; i < count.size(); i++) count[i].ori = i; |
20810
|
0
|
|
|
|
|
|
sort(count.begin() + elementary_feature_empty + 1, count.end(), [](const count_info& a, const count_info& b){ return a.count > b.count; }); |
20811
|
|
|
|
|
|
|
} |
20812
|
|
|
|
|
|
|
|
20813
|
|
|
|
|
|
|
// Create an elementary ids map |
20814
|
0
|
0
|
|
|
|
|
vector> elementary_ids_map(counts.size()); |
20815
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < counts.size(); i++) { |
20816
|
0
|
0
|
|
|
|
|
elementary_ids_map[i].resize(counts[i].size()); |
20817
|
0
|
0
|
|
|
|
|
for (elementary_feature_value j = 0; j < counts[i].size(); j++) |
20818
|
0
|
0
|
|
|
|
|
elementary_ids_map[i][counts[i][j].ori] = counts[i][j].count ? j : elementary_feature_unknown; |
20819
|
|
|
|
|
|
|
} |
20820
|
|
|
|
|
|
|
|
20821
|
|
|
|
|
|
|
// Make optimized elementary maps by applying elementary ids map |
20822
|
|
|
|
|
|
|
optimized_elementary.maps.clear(); |
20823
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < elementary.maps.size(); i++) { |
20824
|
|
|
|
|
|
|
unordered_map mapped_ids; |
20825
|
0
|
0
|
|
|
|
|
for (auto&& element : elementary.maps[i].map) |
20826
|
0
|
0
|
|
|
|
|
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20827
|
0
|
|
|
|
|
|
mapped_ids.emplace(element.first, elementary_ids_map[i][element.second]); |
20828
|
|
|
|
|
|
|
|
20829
|
0
|
0
|
|
|
|
|
optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) { |
|
|
0
|
|
|
|
|
|
20830
|
|
|
|
|
|
|
enc.add_4B(id); |
20831
|
|
|
|
|
|
|
})); |
20832
|
|
|
|
|
|
|
} |
20833
|
|
|
|
|
|
|
|
20834
|
|
|
|
|
|
|
// Remap keys in feature sequences by applying elementary_ids_map to appropriate subkeys |
20835
|
0
|
0
|
|
|
|
|
optimized_features.sequences = features.sequences; |
20836
|
|
|
|
|
|
|
optimized_features.scores.clear(); |
20837
|
|
|
|
|
|
|
vector key_buffer; |
20838
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < features.sequences.size(); i++) { |
20839
|
|
|
|
|
|
|
decltype(features.scores[i].map) updated_map; |
20840
|
0
|
0
|
|
|
|
|
for (auto&& element : features.scores[i].map) |
20841
|
0
|
0
|
|
|
|
|
if (element.second.gamma) { |
20842
|
|
|
|
|
|
|
elementary_ids.clear(); |
20843
|
0
|
0
|
|
|
|
|
for (const char* key = element.first.c_str(); key < element.first.c_str() + element.first.size(); ) |
20844
|
0
|
0
|
|
|
|
|
elementary_ids.emplace_back(vli::decode(key)); |
20845
|
|
|
|
|
|
|
|
20846
|
0
|
0
|
|
|
|
|
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
20847
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
20848
|
0
|
0
|
|
|
|
|
if (map_indices[i][j] < 0) continue; |
20849
|
0
|
0
|
|
|
|
|
assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown); |
|
|
0
|
|
|
|
|
|
20850
|
0
|
|
|
|
|
|
elementary_ids[j] = elementary_ids_map[map_indices[i][j]][elementary_ids[j]]; |
20851
|
|
|
|
|
|
|
} |
20852
|
|
|
|
|
|
|
|
20853
|
0
|
0
|
|
|
|
|
key_buffer.resize(elementary_ids.size() * vli::max_length()); |
20854
|
0
|
|
|
|
|
|
char* key = key_buffer.data(); |
20855
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < elementary_ids.size(); j++) |
20856
|
0
|
|
|
|
|
|
vli::encode(elementary_ids[j], key); |
20857
|
|
|
|
|
|
|
|
20858
|
0
|
|
|
|
|
|
updated_map.emplace(string(key_buffer.data(), key - key_buffer.data()), element.second); |
20859
|
|
|
|
|
|
|
} |
20860
|
|
|
|
|
|
|
|
20861
|
0
|
0
|
|
|
|
|
optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
|
|
0
|
|
|
|
|
|
20862
|
0
|
0
|
|
|
|
|
assert(feature_sequence_score(info.gamma) == info.gamma); |
20863
|
0
|
|
|
|
|
|
enc.add_4B(info.gamma); |
20864
|
0
|
|
|
|
|
|
})); |
20865
|
|
|
|
|
|
|
} |
20866
|
|
|
|
|
|
|
|
20867
|
|
|
|
|
|
|
// Original code which only dropped feature sequences with gamma == 0 |
20868
|
|
|
|
|
|
|
// optimized_elementary.maps.clear(); |
20869
|
|
|
|
|
|
|
// for (auto&& map : elementary.maps) |
20870
|
|
|
|
|
|
|
// optimized_elementary.maps.emplace_back(persistent_unordered_map(map.map, 1, [](binary_encoder& enc, elementary_feature_value value) { |
20871
|
|
|
|
|
|
|
// enc.add_4B(value); |
20872
|
|
|
|
|
|
|
// })); |
20873
|
|
|
|
|
|
|
// |
20874
|
|
|
|
|
|
|
// optimized_features.sequences = features.sequences; |
20875
|
|
|
|
|
|
|
// optimized_features.scores.clear(); |
20876
|
|
|
|
|
|
|
// for (auto&& score : features.scores) { |
20877
|
|
|
|
|
|
|
// decltype(score.map) pruned_map; |
20878
|
|
|
|
|
|
|
// for (auto&& element : score.map) |
20879
|
|
|
|
|
|
|
// if (element.second.gamma) |
20880
|
|
|
|
|
|
|
// pruned_map.insert(element); |
20881
|
|
|
|
|
|
|
// |
20882
|
|
|
|
|
|
|
// optimized_features.scores.emplace_back(persistent_unordered_map(pruned_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
20883
|
|
|
|
|
|
|
// enc.add_4B(info.gamma); |
20884
|
|
|
|
|
|
|
// })); |
20885
|
|
|
|
|
|
|
// } |
20886
|
0
|
|
|
|
|
|
} |
20887
|
|
|
|
|
|
|
|
20888
|
|
|
|
|
|
|
} // namespace morphodita |
20889
|
|
|
|
|
|
|
|
20890
|
|
|
|
|
|
|
///////// |
20891
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger_trainer.h |
20892
|
|
|
|
|
|
|
///////// |
20893
|
|
|
|
|
|
|
|
20894
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
20895
|
|
|
|
|
|
|
// |
20896
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
20897
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
20898
|
|
|
|
|
|
|
// |
20899
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
20900
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
20901
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
20902
|
|
|
|
|
|
|
|
20903
|
|
|
|
|
|
|
namespace morphodita { |
20904
|
|
|
|
|
|
|
|
20905
|
|
|
|
|
|
|
// Declarations |
20906
|
|
|
|
|
|
|
template |
20907
|
|
|
|
|
|
|
class tagger_trainer { |
20908
|
|
|
|
|
|
|
public: |
20909
|
0
|
|
|
|
|
|
struct sentence { |
20910
|
|
|
|
|
|
|
vector words; |
20911
|
|
|
|
|
|
|
vector forms; |
20912
|
|
|
|
|
|
|
vector> analyses; |
20913
|
|
|
|
|
|
|
vector gold; |
20914
|
|
|
|
|
|
|
vector gold_index; |
20915
|
|
|
|
|
|
|
}; |
20916
|
|
|
|
|
|
|
|
20917
|
|
|
|
|
|
|
static void train(int decoding_order, int window_size, int iterations, istream& in_morpho_dict, bool use_guesser, istream& in_feature_templates, bool prune_features, istream& in_train, istream& in_heldout, bool early_stopping, ostream& out_tagger); |
20918
|
|
|
|
|
|
|
|
20919
|
|
|
|
|
|
|
private: |
20920
|
|
|
|
|
|
|
static double load_data(istream& is, const morpho& d, bool use_guesser, vector& sentences, bool add_gold); |
20921
|
|
|
|
|
|
|
}; |
20922
|
|
|
|
|
|
|
|
20923
|
|
|
|
|
|
|
// Definitions |
20924
|
|
|
|
|
|
|
template |
20925
|
0
|
|
|
|
|
|
void tagger_trainer::train(int decoding_order, int window_size, int iterations, istream& in_morpho_dict, bool use_guesser, istream& in_feature_templates, bool prune_features, istream& in_train, istream& in_heldout, bool early_stopping, ostream& out_tagger) { |
20926
|
|
|
|
|
|
|
// cerr << "Loading dictionary: "; |
20927
|
0
|
|
|
|
|
|
unique_ptr d(morpho::load(in_morpho_dict)); |
20928
|
0
|
0
|
|
|
|
|
if (!d) training_failure("Cannot load dictionary!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20929
|
|
|
|
|
|
|
// cerr << "done" << endl; |
20930
|
0
|
0
|
|
|
|
|
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20931
|
|
|
|
|
|
|
|
20932
|
0
|
|
|
|
|
|
vector train_data; |
20933
|
|
|
|
|
|
|
// cerr << "Loading train data: "; |
20934
|
|
|
|
|
|
|
// cerr << "done, matched " << fixed << setprecision(2) << 100 * load_data(in_train, *d, use_guesser, train_data, true) << '%' << endl; |
20935
|
0
|
0
|
|
|
|
|
load_data(in_train, *d, use_guesser, train_data, true); |
20936
|
|
|
|
|
|
|
|
20937
|
0
|
|
|
|
|
|
vector heldout_data; |
20938
|
0
|
0
|
|
|
|
|
if (in_heldout) { |
20939
|
|
|
|
|
|
|
// cerr << "Loading heldout data: "; |
20940
|
|
|
|
|
|
|
// cerr << "done, matched " << fixed << setprecision(2) << 100 * load_data(in_heldout, *d, use_guesser, heldout_data, false) << '%' << endl; |
20941
|
0
|
0
|
|
|
|
|
load_data(in_heldout, *d, use_guesser, heldout_data, false); |
20942
|
|
|
|
|
|
|
} |
20943
|
|
|
|
|
|
|
|
20944
|
|
|
|
|
|
|
// Encode morphological dictionary |
20945
|
|
|
|
|
|
|
// cerr << "Encoding morphological dictionary." << endl; |
20946
|
0
|
0
|
|
|
|
|
out_tagger << in_morpho_dict.rdbuf(); |
20947
|
0
|
0
|
|
|
|
|
out_tagger.put(use_guesser); |
20948
|
|
|
|
|
|
|
|
20949
|
|
|
|
|
|
|
// Train and encode the tagger |
20950
|
0
|
0
|
|
|
|
|
TaggerTrainer::train(decoding_order, window_size, iterations, train_data, heldout_data, early_stopping, prune_features, in_feature_templates, out_tagger); |
20951
|
0
|
|
|
|
|
|
} |
20952
|
|
|
|
|
|
|
|
20953
|
|
|
|
|
|
|
template |
20954
|
0
|
|
|
|
|
|
double tagger_trainer::load_data(istream& is, const morpho& d, bool use_guesser, vector& sentences, bool add_gold) { |
20955
|
|
|
|
|
|
|
sentences.clear(); |
20956
|
|
|
|
|
|
|
|
20957
|
|
|
|
|
|
|
int forms = 0, forms_matched = 0; |
20958
|
|
|
|
|
|
|
|
20959
|
|
|
|
|
|
|
string line; |
20960
|
0
|
|
|
|
|
|
vector tokens; |
20961
|
0
|
0
|
|
|
|
|
sentences.emplace_back(); |
20962
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
0
|
|
|
|
|
|
20963
|
0
|
0
|
|
|
|
|
if (line.empty()) { |
20964
|
0
|
0
|
|
|
|
|
if (!sentences.back().words.empty()) |
20965
|
0
|
0
|
|
|
|
|
sentences.emplace_back(); |
20966
|
|
|
|
|
|
|
continue; |
20967
|
|
|
|
|
|
|
} |
20968
|
|
|
|
|
|
|
|
20969
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
20970
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20971
|
|
|
|
|
|
|
|
20972
|
|
|
|
|
|
|
// Add form to sentence |
20973
|
0
|
|
|
|
|
|
forms++; |
20974
|
|
|
|
|
|
|
sentence& s = sentences.back(); |
20975
|
0
|
0
|
|
|
|
|
s.words.emplace_back(tokens[0]); |
20976
|
0
|
0
|
|
|
|
|
s.gold.emplace_back(tokens[1], tokens[2]); |
20977
|
0
|
0
|
|
|
|
|
s.gold_index.emplace_back(-1); |
20978
|
|
|
|
|
|
|
|
20979
|
|
|
|
|
|
|
// Analyse |
20980
|
0
|
0
|
|
|
|
|
s.analyses.emplace_back(); |
20981
|
0
|
0
|
|
|
|
|
d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back()); |
|
|
0
|
|
|
|
|
|
20982
|
|
|
|
|
|
|
|
20983
|
|
|
|
|
|
|
// Locate gold analysis |
20984
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < s.analyses.back().size(); i++) |
20985
|
0
|
0
|
|
|
|
|
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20986
|
0
|
|
|
|
|
|
s.gold_index.back() = i; |
20987
|
0
|
|
|
|
|
|
forms_matched++; |
20988
|
0
|
|
|
|
|
|
break; |
20989
|
|
|
|
|
|
|
} |
20990
|
0
|
0
|
|
|
|
|
if (s.gold_index.back() == -1 && add_gold) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20991
|
0
|
|
|
|
|
|
s.gold_index.back() = s.analyses.back().size(); |
20992
|
0
|
0
|
|
|
|
|
s.analyses.back().emplace_back(tokens[1], tokens[2]); |
20993
|
|
|
|
|
|
|
} |
20994
|
|
|
|
|
|
|
} |
20995
|
0
|
0
|
|
|
|
|
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
20996
|
|
|
|
|
|
|
|
20997
|
|
|
|
|
|
|
// Fill the forms string_pieces now that the sentences will not reallocate |
20998
|
0
|
0
|
|
|
|
|
for (auto&& sentence : sentences) |
20999
|
0
|
0
|
|
|
|
|
for (auto&& word : sentence.words) |
21000
|
0
|
0
|
|
|
|
|
sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word))); |
|
|
0
|
|
|
|
|
|
21001
|
|
|
|
|
|
|
|
21002
|
0
|
|
|
|
|
|
return forms_matched / double(forms); |
21003
|
|
|
|
|
|
|
} |
21004
|
|
|
|
|
|
|
|
21005
|
|
|
|
|
|
|
} // namespace morphodita |
21006
|
|
|
|
|
|
|
|
21007
|
|
|
|
|
|
|
///////// |
21008
|
|
|
|
|
|
|
// File: morphodita/tagger/perceptron_tagger_trainer.h |
21009
|
|
|
|
|
|
|
///////// |
21010
|
|
|
|
|
|
|
|
21011
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
21012
|
|
|
|
|
|
|
// |
21013
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
21014
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
21015
|
|
|
|
|
|
|
// |
21016
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
21017
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
21018
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
21019
|
|
|
|
|
|
|
|
21020
|
|
|
|
|
|
|
namespace morphodita { |
21021
|
|
|
|
|
|
|
|
21022
|
|
|
|
|
|
|
// Declarations |
21023
|
|
|
|
|
|
|
template |
21024
|
|
|
|
|
|
|
class perceptron_tagger_trainer { |
21025
|
|
|
|
|
|
|
public: |
21026
|
|
|
|
|
|
|
typedef typename tagger_trainer>::sentence sentence; |
21027
|
|
|
|
|
|
|
|
21028
|
|
|
|
|
|
|
static void train(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, istream& in_feature_templates, ostream& out_tagger); |
21029
|
|
|
|
|
|
|
|
21030
|
|
|
|
|
|
|
private: |
21031
|
|
|
|
|
|
|
static void train_viterbi(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, FeatureSequences& features); |
21032
|
|
|
|
|
|
|
}; |
21033
|
|
|
|
|
|
|
|
21034
|
|
|
|
|
|
|
// Definitions |
21035
|
|
|
|
|
|
|
template |
21036
|
0
|
|
|
|
|
|
void perceptron_tagger_trainer::train(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, istream& in_feature_templates, ostream& out_tagger) { |
21037
|
0
|
|
|
|
|
|
FeatureSequences features; |
21038
|
|
|
|
|
|
|
|
21039
|
|
|
|
|
|
|
// cerr << "Parsing feature templates..." << endl; |
21040
|
0
|
0
|
|
|
|
|
features.parse(window_size, in_feature_templates); |
21041
|
|
|
|
|
|
|
|
21042
|
|
|
|
|
|
|
// cerr << "Training tagger..." << endl; |
21043
|
0
|
0
|
|
|
|
|
train_viterbi(decoding_order, window_size, iterations, train, heldout, early_stopping, prune_features, features); |
21044
|
|
|
|
|
|
|
|
21045
|
|
|
|
|
|
|
// cerr << "Encoding tagger..." << endl; |
21046
|
|
|
|
|
|
|
typedef feature_sequences_optimizer optimizer; |
21047
|
0
|
|
|
|
|
|
typename optimizer::optimized_feature_sequences optimized_features; |
21048
|
0
|
0
|
|
|
|
|
optimizer::optimize(features, optimized_features); |
21049
|
0
|
0
|
|
|
|
|
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21050
|
0
|
|
|
|
|
|
} |
21051
|
|
|
|
|
|
|
|
21052
|
|
|
|
|
|
|
template |
21053
|
0
|
|
|
|
|
|
void perceptron_tagger_trainer::train_viterbi(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, FeatureSequences& features) { |
21054
|
|
|
|
|
|
|
int best_correct = 0, best_iteration = -1; |
21055
|
0
|
|
|
|
|
|
FeatureSequences best_features; |
21056
|
|
|
|
|
|
|
|
21057
|
|
|
|
|
|
|
viterbi decoder(features, decoding_order, window_size); |
21058
|
0
|
0
|
|
|
|
|
typename decltype(decoder)::cache decoder_cache(decoder); |
21059
|
|
|
|
|
|
|
|
21060
|
0
|
0
|
|
|
|
|
typename FeatureSequences::cache feature_sequences_cache(features); |
21061
|
|
|
|
|
|
|
typename FeatureSequences::dynamic_features decoded_dynamic_features, gold_dynamic_features; |
21062
|
0
|
|
|
|
|
|
vector decoded_feature_sequences_keys, gold_feature_sequences_keys; |
21063
|
|
|
|
|
|
|
|
21064
|
0
|
0
|
|
|
|
|
vector window(window_size); |
21065
|
|
|
|
|
|
|
|
21066
|
|
|
|
|
|
|
// Initialize feature sequences for the gold decoding only if requested |
21067
|
0
|
0
|
|
|
|
|
if (prune_features) |
21068
|
0
|
0
|
|
|
|
|
for (unsigned s = 0; s < train.size(); s++) { |
21069
|
|
|
|
|
|
|
auto& sentence = train[s]; |
21070
|
0
|
0
|
|
|
|
|
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
21071
|
0
|
0
|
|
|
|
|
for (int i = 0; i < int(sentence.forms.size()); i++) { |
21072
|
0
|
|
|
|
|
|
window.assign(window_size, -1); |
21073
|
0
|
0
|
|
|
|
|
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
|
0
|
|
|
|
|
|
21074
|
|
|
|
|
|
|
|
21075
|
0
|
|
|
|
|
|
features.compute_dynamic_features(i, window[0], &gold_dynamic_features, gold_dynamic_features, feature_sequences_cache); |
21076
|
0
|
0
|
|
|
|
|
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
21077
|
|
|
|
|
|
|
|
21078
|
0
|
0
|
|
|
|
|
for (unsigned f = 0; f < features.scores.size(); f++) |
21079
|
0
|
0
|
|
|
|
|
if (!gold_feature_sequences_keys[f].empty()) |
21080
|
|
|
|
|
|
|
features.scores[f].map[gold_feature_sequences_keys[f]]; |
21081
|
|
|
|
|
|
|
} |
21082
|
|
|
|
|
|
|
} |
21083
|
|
|
|
|
|
|
|
21084
|
|
|
|
|
|
|
// Train for given number of iterations |
21085
|
0
|
0
|
|
|
|
|
for (int i = 0; i < iterations; i++) { |
21086
|
|
|
|
|
|
|
// Train |
21087
|
|
|
|
|
|
|
int train_correct = 0, train_total = 0; |
21088
|
0
|
0
|
|
|
|
|
cerr << "Iteration " << i + 1 << ": "; |
|
|
0
|
|
|
|
|
|
21089
|
|
|
|
|
|
|
|
21090
|
|
|
|
|
|
|
vector tags; |
21091
|
0
|
0
|
|
|
|
|
for (unsigned s = 0; s < train.size(); s++) { |
21092
|
|
|
|
|
|
|
auto& sentence = train[s]; |
21093
|
|
|
|
|
|
|
|
21094
|
|
|
|
|
|
|
// Run Viterbi |
21095
|
0
|
0
|
|
|
|
|
if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size()); |
|
|
0
|
|
|
|
|
|
21096
|
0
|
0
|
|
|
|
|
decoder.tag(sentence.forms, sentence.analyses, decoder_cache, tags); |
21097
|
|
|
|
|
|
|
|
21098
|
|
|
|
|
|
|
// Compute feature sequence keys or decoded result and gold result and update alpha & gamma |
21099
|
0
|
0
|
|
|
|
|
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
21100
|
0
|
0
|
|
|
|
|
for (int i = 0; i < int(sentence.forms.size()); i++) { |
21101
|
0
|
|
|
|
|
|
train_correct += tags[i] == sentence.gold_index[i]; |
21102
|
0
|
|
|
|
|
|
train_total++; |
21103
|
|
|
|
|
|
|
|
21104
|
0
|
|
|
|
|
|
window.assign(window_size, -1); |
21105
|
0
|
0
|
|
|
|
|
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j]; |
|
|
0
|
|
|
|
|
|
21106
|
0
|
|
|
|
|
|
features.compute_dynamic_features(i, window[0], &decoded_dynamic_features, decoded_dynamic_features, feature_sequences_cache); |
21107
|
0
|
0
|
|
|
|
|
features.feature_keys(i, window.data(), 0, decoded_dynamic_features, decoded_feature_sequences_keys, feature_sequences_cache); |
21108
|
|
|
|
|
|
|
|
21109
|
0
|
0
|
|
|
|
|
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
|
0
|
|
|
|
|
|
21110
|
0
|
|
|
|
|
|
features.compute_dynamic_features(i, window[0], &gold_dynamic_features, gold_dynamic_features, feature_sequences_cache); |
21111
|
0
|
0
|
|
|
|
|
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
21112
|
|
|
|
|
|
|
|
21113
|
0
|
0
|
|
|
|
|
for (unsigned f = 0; f < features.scores.size(); f++) { |
21114
|
0
|
0
|
|
|
|
|
if (decoded_feature_sequences_keys[f] != gold_feature_sequences_keys[f]) { |
21115
|
0
|
0
|
|
|
|
|
if (!decoded_feature_sequences_keys[f].empty()) { |
21116
|
|
|
|
|
|
|
auto it = features.scores[f].map.find(decoded_feature_sequences_keys[f]); |
21117
|
0
|
0
|
|
|
|
|
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21118
|
0
|
0
|
|
|
|
|
if (it != features.scores[f].map.end()) { |
21119
|
|
|
|
|
|
|
auto& decoded_info = it->second; |
21120
|
0
|
|
|
|
|
|
decoded_info.gamma += decoded_info.alpha * (s - decoded_info.last_gamma_update); |
21121
|
0
|
|
|
|
|
|
decoded_info.last_gamma_update = s; |
21122
|
0
|
|
|
|
|
|
decoded_info.alpha--; |
21123
|
|
|
|
|
|
|
} |
21124
|
|
|
|
|
|
|
} |
21125
|
|
|
|
|
|
|
|
21126
|
0
|
0
|
|
|
|
|
if (!gold_feature_sequences_keys[f].empty()) { |
21127
|
|
|
|
|
|
|
auto it = features.scores[f].map.find(gold_feature_sequences_keys[f]); |
21128
|
0
|
0
|
|
|
|
|
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21129
|
0
|
0
|
|
|
|
|
if (it != features.scores[f].map.end()) { |
21130
|
|
|
|
|
|
|
auto& gold_info = it->second; |
21131
|
0
|
|
|
|
|
|
gold_info.gamma += gold_info.alpha * (s - gold_info.last_gamma_update); |
21132
|
0
|
|
|
|
|
|
gold_info.last_gamma_update = s; |
21133
|
0
|
|
|
|
|
|
gold_info.alpha++; |
21134
|
|
|
|
|
|
|
} |
21135
|
|
|
|
|
|
|
} |
21136
|
|
|
|
|
|
|
} |
21137
|
|
|
|
|
|
|
} |
21138
|
|
|
|
|
|
|
} |
21139
|
|
|
|
|
|
|
} |
21140
|
|
|
|
|
|
|
|
21141
|
|
|
|
|
|
|
// Finalize incremental gamma updates |
21142
|
0
|
0
|
|
|
|
|
for (auto&& score : features.scores) |
21143
|
0
|
0
|
|
|
|
|
for (auto&& element : score.map) { |
21144
|
0
|
|
|
|
|
|
element.second.gamma += element.second.alpha * (train.size() - element.second.last_gamma_update); |
21145
|
0
|
|
|
|
|
|
element.second.last_gamma_update = 0; |
21146
|
|
|
|
|
|
|
} |
21147
|
0
|
|
|
|
|
|
cerr << "done, accuracy " << fixed << setprecision(2) << train_correct * 100 / double(train_total) << '%'; |
21148
|
|
|
|
|
|
|
|
21149
|
|
|
|
|
|
|
// If we have any heldout data, compute accuracy and if requested store best tagger configuration |
21150
|
0
|
0
|
|
|
|
|
if (!heldout.empty()) { |
21151
|
|
|
|
|
|
|
enum { TAGS, LEMMAS, BOTH, TOTAL }; |
21152
|
|
|
|
|
|
|
int heldout_correct[TOTAL] = {}, heldout_total = 0; |
21153
|
|
|
|
|
|
|
|
21154
|
|
|
|
|
|
|
typedef feature_sequences_optimizer optimizer; |
21155
|
0
|
|
|
|
|
|
typename optimizer::optimized_feature_sequences frozen_features; |
21156
|
0
|
0
|
|
|
|
|
optimizer::optimize(features, frozen_features); |
21157
|
|
|
|
|
|
|
viterbi frozen_decoder(frozen_features, decoding_order, window_size); |
21158
|
0
|
0
|
|
|
|
|
typename decltype(frozen_decoder)::cache frozen_decoder_cache(frozen_decoder); |
21159
|
|
|
|
|
|
|
|
21160
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
21161
|
0
|
0
|
|
|
|
|
if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2); |
|
|
0
|
|
|
|
|
|
21162
|
0
|
0
|
|
|
|
|
frozen_decoder.tag(sentence.forms, sentence.analyses, frozen_decoder_cache, tags); |
21163
|
|
|
|
|
|
|
|
21164
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < sentence.forms.size(); i++) { |
21165
|
0
|
|
|
|
|
|
heldout_correct[TAGS] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag; |
21166
|
0
|
|
|
|
|
|
heldout_correct[LEMMAS] += sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
21167
|
0
|
0
|
|
|
|
|
heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
|
|
0
|
|
|
|
|
|
21168
|
0
|
|
|
|
|
|
heldout_total++; |
21169
|
|
|
|
|
|
|
} |
21170
|
|
|
|
|
|
|
} |
21171
|
|
|
|
|
|
|
|
21172
|
0
|
0
|
|
|
|
|
if (early_stopping && heldout_correct[BOTH] > best_correct) { |
|
|
0
|
|
|
|
|
|
21173
|
|
|
|
|
|
|
best_correct = heldout_correct[BOTH]; |
21174
|
|
|
|
|
|
|
best_iteration = i; |
21175
|
0
|
0
|
|
|
|
|
best_features = features; |
21176
|
|
|
|
|
|
|
} |
21177
|
|
|
|
|
|
|
|
21178
|
0
|
0
|
|
|
|
|
cerr << ", heldout accuracy " << fixed << setprecision(2) |
21179
|
0
|
|
|
|
|
|
<< 100 * heldout_correct[TAGS] / double(heldout_total) << "%t/" |
21180
|
0
|
|
|
|
|
|
<< 100 * heldout_correct[LEMMAS] / double(heldout_total) << "%l/" |
21181
|
0
|
|
|
|
|
|
<< 100 * heldout_correct[BOTH] / double(heldout_total) << "%b"; |
21182
|
|
|
|
|
|
|
} |
21183
|
|
|
|
|
|
|
cerr << endl; |
21184
|
|
|
|
|
|
|
} |
21185
|
|
|
|
|
|
|
|
21186
|
0
|
0
|
|
|
|
|
if (early_stopping && best_iteration >= 0) { |
21187
|
0
|
0
|
|
|
|
|
cerr << "Chosen tagger model from iteration " << best_iteration + 1 << endl; |
21188
|
0
|
0
|
|
|
|
|
features = best_features; |
21189
|
|
|
|
|
|
|
} |
21190
|
0
|
|
|
|
|
|
} |
21191
|
|
|
|
|
|
|
|
21192
|
|
|
|
|
|
|
} // namespace morphodita |
21193
|
|
|
|
|
|
|
|
21194
|
|
|
|
|
|
|
///////// |
21195
|
|
|
|
|
|
|
// File: utils/options.h |
21196
|
|
|
|
|
|
|
///////// |
21197
|
|
|
|
|
|
|
|
21198
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
21199
|
|
|
|
|
|
|
// |
21200
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
21201
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
21202
|
|
|
|
|
|
|
// |
21203
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
21204
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
21205
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
21206
|
|
|
|
|
|
|
|
21207
|
|
|
|
|
|
|
namespace utils { |
21208
|
|
|
|
|
|
|
|
21209
|
|
|
|
|
|
|
// Command-line option parsing helper.
class options {
 public:
  // Result of parsing: a string-to-string map filled in by parse().
  typedef unordered_map<string, string> map;

  // Describes which values a single option accepts.
  struct value {
    // NONE, ANY and SET are the three acceptance modes; SET is the mode
    // selected by the public initializer-list constructor below.
    enum allowed_t { NONE, ANY, SET };
    allowed_t allowed;
    // Explicitly listed values; filled only by the initializer-list constructor.
    unordered_set<string> set;

    // Creates a SET-mode description holding the listed values.
    value(initializer_list<string> set) : allowed(SET), set(set) {}
    // Predefined descriptions (defined elsewhere); presumably the NONE and
    // ANY modes respectively -- confirm against their definitions.
    static const value none;
    static const value any;

   private:
    // Used only to build descriptions without an explicit value set.
    value(allowed_t allowed) : allowed(allowed) {}
  };

  // Parse options according to allowed map. If successful, argv is reordered so
  // that non-option arguments are placed in argv[1] to argv[argc-1]. The '--'
  // indicates end of option arguments (as usual). The allowed map contains
  // values allowed for every option. If empty, no value is allowed, if it
  // contains just an empty string, any value is allowed.
  // NOTE(review): the last sentence predates value::none / value::any and may
  // be stale -- confirm against the implementation of parse().
  static bool parse(const unordered_map<string, value>& allowed, int& argc, char**& argv, map& options);
};
21233
|
|
|
|
|
|
|
|
21234
|
|
|
|
|
|
|
} // namespace utils |
21235
|
|
|
|
|
|
|
|
21236
|
|
|
|
|
|
|
///////// |
21237
|
|
|
|
|
|
|
// File: version/version.h |
21238
|
|
|
|
|
|
|
///////// |
21239
|
|
|
|
|
|
|
|
21240
|
|
|
|
|
|
|
// This file is part of UDPipe . |
21241
|
|
|
|
|
|
|
// |
21242
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
21243
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
21244
|
|
|
|
|
|
|
// |
21245
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
21246
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
21247
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
21248
|
|
|
|
|
|
|
|
21249
|
0
|
|
|
|
|
|
// Version descriptor: three unsigned components plus a prerelease string.
class version {
 public:
  unsigned major;
  unsigned minor;
  unsigned patch;
  std::string prerelease;

  // Returns current version.
  static version current();

  // Returns multi-line formatted version and copyright string.
  // The other_libraries argument defaults to an empty string.
  static std::string version_and_copyright(const std::string& other_libraries = std::string());
};
21262
|
|
|
|
|
|
|
|
21263
|
|
|
|
|
|
|
///////// |
21264
|
|
|
|
|
|
|
// File: trainer/trainer_morphodita_parsito.cpp |
21265
|
|
|
|
|
|
|
///////// |
21266
|
|
|
|
|
|
|
|
21267
|
|
|
|
|
|
|
// This file is part of UDPipe . |
21268
|
|
|
|
|
|
|
// |
21269
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
21270
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
21271
|
|
|
|
|
|
|
// |
21272
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
21273
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
21274
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
21275
|
|
|
|
|
|
|
|
21276
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train(const vector& training, const vector& heldout, |
21277
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error) { |
21278
|
|
|
|
|
|
|
error.clear(); |
21279
|
|
|
|
|
|
|
|
21280
|
|
|
|
|
|
|
// Save model version info |
21281
|
0
|
|
|
|
|
|
os.put(model_morphodita_parsito::VERSION_LATEST); |
21282
|
|
|
|
|
|
|
// Add sentinel required since version 2 |
21283
|
0
|
|
|
|
|
|
os.put(0x7F).put(0x7F); |
21284
|
|
|
|
|
|
|
|
21285
|
|
|
|
|
|
|
// Check input data |
21286
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
21287
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
21288
|
0
|
0
|
|
|
|
|
if (!can_combine_tag(sentence.words[i], error)) |
21289
|
|
|
|
|
|
|
return false; |
21290
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) |
21291
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
21292
|
0
|
0
|
|
|
|
|
if (!can_combine_tag(sentence.words[i], error)) |
21293
|
|
|
|
|
|
|
return false; |
21294
|
|
|
|
|
|
|
|
21295
|
0
|
0
|
|
|
|
|
if (!train_tokenizer(training, heldout, tokenizer, os, error)) return false; |
21296
|
|
|
|
|
|
|
string tagger_model; |
21297
|
|
|
|
|
|
|
{ |
21298
|
0
|
0
|
|
|
|
|
ostringstream os_tagger; |
21299
|
0
|
0
|
|
|
|
|
if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false; |
|
|
0
|
|
|
|
|
|
21300
|
0
|
|
|
|
|
|
tagger_model.assign(os_tagger.str()); |
21301
|
0
|
0
|
|
|
|
|
os.write(tagger_model.data(), tagger_model.size()); |
21302
|
|
|
|
|
|
|
} |
21303
|
0
|
0
|
|
|
|
|
if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false; |
|
|
0
|
|
|
|
|
|
21304
|
|
|
|
|
|
|
|
21305
|
0
|
|
|
|
|
|
return true; |
21306
|
|
|
|
|
|
|
} |
21307
|
|
|
|
|
|
|
|
21308
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_tokenizer(const vector& training, const vector& heldout, |
21309
|
|
|
|
|
|
|
const string& options, ostream& os, string& error) { |
21310
|
0
|
0
|
|
|
|
|
if (options == NONE) { |
21311
|
0
|
|
|
|
|
|
os.put(0); |
21312
|
|
|
|
|
|
|
} else { |
21313
|
|
|
|
|
|
|
// Tokenizer options |
21314
|
|
|
|
|
|
|
named_values::map tokenizer; |
21315
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, tokenizer, error)) return false; |
|
|
0
|
|
|
|
|
|
21316
|
0
|
0
|
|
|
|
|
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21317
|
|
|
|
|
|
|
|
21318
|
0
|
0
|
|
|
|
|
if (tokenizer.count("from_model")) { |
|
|
0
|
|
|
|
|
|
21319
|
|
|
|
|
|
|
// Use specified tokenizer model |
21320
|
|
|
|
|
|
|
string_piece tokenizer_data; |
21321
|
0
|
0
|
|
|
|
|
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21322
|
0
|
0
|
|
|
|
|
return error.assign("Cannot load model from which the tokenizer should be used!"), false; |
21323
|
|
|
|
|
|
|
|
21324
|
|
|
|
|
|
|
cerr << "Using tokenizer from given model." << endl; |
21325
|
0
|
0
|
|
|
|
|
os.write(tokenizer_data.str, tokenizer_data.len); |
21326
|
|
|
|
|
|
|
} else { |
21327
|
0
|
0
|
|
|
|
|
os.put(1); |
21328
|
0
|
0
|
|
|
|
|
const string& model = option_str(tokenizer, "model"); |
|
|
0
|
|
|
|
|
|
21329
|
|
|
|
|
|
|
|
21330
|
|
|
|
|
|
|
// Tokenizer itself |
21331
|
0
|
0
|
|
|
|
|
if (model == "generic") { |
21332
|
0
|
0
|
|
|
|
|
os.put(morphodita::tokenizer_id::GENERIC); |
21333
|
|
|
|
|
|
|
morphodita::generic_tokenizer_factory_encoder::encode(morphodita::generic_tokenizer::LATEST, os); |
21334
|
0
|
0
|
|
|
|
|
} else if (model.empty() || model == "gru") { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21335
|
|
|
|
|
|
|
// Create a detokenizator if required |
21336
|
0
|
|
|
|
|
|
unique_ptr detokenizer; |
21337
|
0
|
0
|
|
|
|
|
if (tokenizer.count("detokenize")) { |
|
|
0
|
|
|
|
|
|
21338
|
0
|
0
|
|
|
|
|
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21339
|
0
|
0
|
|
|
|
|
if (!detokenizer) return error.assign("Cannot create detokenizer!"), false; |
|
|
0
|
|
|
|
|
|
21340
|
|
|
|
|
|
|
} |
21341
|
|
|
|
|
|
|
|
21342
|
|
|
|
|
|
|
// Prepare training data for the gru_tokenizer |
21343
|
0
|
|
|
|
|
|
vector sentences; |
21344
|
|
|
|
|
|
|
bool spaces_in_training = false; |
21345
|
0
|
0
|
|
|
|
|
for (size_t training_sentence = 0; training_sentence < training.size(); training_sentence++) { |
21346
|
0
|
0
|
|
|
|
|
sentence s = training[training_sentence]; |
21347
|
0
|
0
|
|
|
|
|
if (detokenizer) detokenizer->detokenize(s); |
|
|
0
|
|
|
|
|
|
21348
|
|
|
|
|
|
|
|
21349
|
0
|
0
|
|
|
|
|
auto& sentence = (sentences.emplace_back(), sentences.back()); |
21350
|
|
|
|
|
|
|
|
21351
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
21352
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
21353
|
0
|
0
|
|
|
|
|
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
21354
|
|
|
|
|
|
|
|
21355
|
0
|
0
|
|
|
|
|
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
21356
|
0
|
0
|
|
|
|
|
for (auto&& chr : unilib::utf8::decoder(tok.form)) { |
21357
|
0
|
0
|
|
|
|
|
sentence.sentence.push_back(chr); |
21358
|
0
|
0
|
|
|
|
|
if (unilib::unicode::category(chr) & unilib::unicode::Zs) spaces_in_training = true; |
21359
|
|
|
|
|
|
|
} |
21360
|
0
|
|
|
|
|
|
sentence.tokens.back().length = sentence.sentence.size() - sentence.tokens.back().start; |
21361
|
|
|
|
|
|
|
|
21362
|
0
|
0
|
|
|
|
|
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21363
|
|
|
|
|
|
|
|
21364
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21365
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
21366
|
|
|
|
|
|
|
} |
21367
|
0
|
0
|
|
|
|
|
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21368
|
0
|
|
|
|
|
|
sentence.sentence.append(2, '\n'); |
21369
|
|
|
|
|
|
|
} |
21370
|
|
|
|
|
|
|
|
21371
|
|
|
|
|
|
|
// Heldout data |
21372
|
0
|
|
|
|
|
|
vector heldout_sentences; |
21373
|
|
|
|
|
|
|
|
21374
|
0
|
0
|
|
|
|
|
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21375
|
0
|
0
|
|
|
|
|
for (size_t heldout_sentence = 0; heldout_sentence < heldout.size(); heldout_sentence++) { |
21376
|
0
|
0
|
|
|
|
|
sentence s = heldout[heldout_sentence]; |
21377
|
0
|
0
|
|
|
|
|
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21378
|
|
|
|
|
|
|
|
21379
|
0
|
0
|
|
|
|
|
auto& sentence = (heldout_sentences.emplace_back(), heldout_sentences.back()); |
21380
|
|
|
|
|
|
|
|
21381
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
21382
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
21383
|
0
|
0
|
|
|
|
|
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
21384
|
|
|
|
|
|
|
|
21385
|
0
|
0
|
|
|
|
|
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
21386
|
0
|
0
|
|
|
|
|
for (auto&& chr : unilib::utf8::decoder(tok.form)) |
21387
|
0
|
0
|
|
|
|
|
sentence.sentence.push_back(chr); |
21388
|
0
|
|
|
|
|
|
sentence.tokens.back().length = sentence.sentence.size() - sentence.tokens.back().start; |
21389
|
|
|
|
|
|
|
|
21390
|
0
|
0
|
|
|
|
|
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21391
|
|
|
|
|
|
|
|
21392
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21393
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
21394
|
|
|
|
|
|
|
} |
21395
|
0
|
0
|
|
|
|
|
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21396
|
0
|
|
|
|
|
|
sentence.sentence.append(2, '\n'); |
21397
|
|
|
|
|
|
|
} |
21398
|
|
|
|
|
|
|
|
21399
|
|
|
|
|
|
|
// Options |
21400
|
0
|
0
|
|
|
|
|
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21401
|
0
|
0
|
|
|
|
|
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21402
|
0
|
0
|
|
|
|
|
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21403
|
0
|
0
|
|
|
|
|
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21404
|
0
|
0
|
|
|
|
|
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21405
|
0
|
0
|
|
|
|
|
int batch_size = run <= 1 ? 50 : 50 + 50 * hyperparameter_integer(run, 1, 0, 1); |
21406
|
0
|
0
|
|
|
|
|
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21407
|
0
|
0
|
|
|
|
|
double learning_rate = run <= 1 ? 0.005 : hyperparameter_logarithmic(run, 2, 0.0005, 0.01); |
21408
|
0
|
0
|
|
|
|
|
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21409
|
0
|
0
|
|
|
|
|
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21410
|
0
|
0
|
|
|
|
|
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21411
|
0
|
0
|
|
|
|
|
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21412
|
0
|
0
|
|
|
|
|
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21413
|
|
|
|
|
|
|
|
21414
|
0
|
0
|
|
|
|
|
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21415
|
0
|
|
|
|
|
|
<< ", learning_rate=" << fixed << setprecision(8) << learning_rate << endl; |
21416
|
|
|
|
|
|
|
|
21417
|
0
|
0
|
|
|
|
|
cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0) |
|
|
0
|
|
|
|
|
|
21418
|
0
|
0
|
|
|
|
|
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21419
|
0
|
0
|
|
|
|
|
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21420
|
0
|
|
|
|
|
|
<< ", learning_rate=" << fixed << setprecision(4) << learning_rate << ", learning_rate_final=" << learning_rate_final << endl |
21421
|
0
|
0
|
|
|
|
|
<< " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
|
0
|
|
|
|
|
|
21422
|
|
|
|
|
|
|
|
21423
|
|
|
|
|
|
|
// Train and encode gru_tokenizer |
21424
|
0
|
0
|
|
|
|
|
os.put(morphodita::tokenizer_ids::GRU); |
21425
|
0
|
0
|
|
|
|
|
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21426
|
|
|
|
|
|
|
segment_size, allow_spaces, dimension, epochs, batch_size, learning_rate, |
21427
|
|
|
|
|
|
|
learning_rate_final, dropout, initialization_range, early_stopping, |
21428
|
|
|
|
|
|
|
sentences, heldout_sentences, os, error)) |
21429
|
|
|
|
|
|
|
return false; |
21430
|
|
|
|
|
|
|
} else { |
21431
|
0
|
0
|
|
|
|
|
return error.assign("Unknown tokenizer model '").append(model).append("'!"), false; |
|
|
0
|
|
|
|
|
|
21432
|
|
|
|
|
|
|
} |
21433
|
|
|
|
|
|
|
|
21434
|
|
|
|
|
|
|
// Multiword splitter |
21435
|
0
|
0
|
|
|
|
|
if (!multiword_splitter_trainer::train(training, os, error)) return false; |
|
|
0
|
|
|
|
|
|
21436
|
|
|
|
|
|
|
} |
21437
|
|
|
|
|
|
|
} |
21438
|
|
|
|
|
|
|
|
21439
|
|
|
|
|
|
|
return true; |
21440
|
|
|
|
|
|
|
} |
21441
|
|
|
|
|
|
|
|
21442
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_tagger(const vector& training, const vector& heldout, |
21443
|
|
|
|
|
|
|
const string& options, ostream& os, string& error) { |
21444
|
0
|
0
|
|
|
|
|
if (options == NONE) { |
21445
|
0
|
|
|
|
|
|
os.put(0); |
21446
|
|
|
|
|
|
|
} else { |
21447
|
|
|
|
|
|
|
// Parse options |
21448
|
|
|
|
|
|
|
named_values::map tagger; |
21449
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, tagger, error)) return false; |
|
|
0
|
|
|
|
|
|
21450
|
|
|
|
|
|
|
|
21451
|
0
|
0
|
|
|
|
|
if (tagger.count("from_model")) { |
|
|
0
|
|
|
|
|
|
21452
|
|
|
|
|
|
|
// Use specified tokenizer model(s) |
21453
|
|
|
|
|
|
|
int model_index = 0, taggers_total = 0; |
21454
|
0
|
0
|
|
|
|
|
string model_name = "from_model"; |
21455
|
|
|
|
|
|
|
vector taggers_data; |
21456
|
0
|
0
|
|
|
|
|
do { |
21457
|
0
|
0
|
|
|
|
|
taggers_data.emplace_back(); |
21458
|
0
|
0
|
|
|
|
|
if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back())) |
|
|
0
|
|
|
|
|
|
21459
|
0
|
0
|
|
|
|
|
return error.assign("Cannot load model from which the tagger should be used!"), false; |
21460
|
0
|
0
|
|
|
|
|
if (taggers_data.back().str[0]) { |
21461
|
0
|
|
|
|
|
|
taggers_total += taggers_data.back().str[0]; |
21462
|
|
|
|
|
|
|
|
21463
|
0
|
0
|
|
|
|
|
vector overrides = {"lemma", "xpostag", "feats"}; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21464
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < overrides.size(); i++) { |
21465
|
0
|
0
|
|
|
|
|
string override_name = "from_model_" + overrides[i]; |
21466
|
0
|
|
|
|
|
|
int override_value = -1; |
21467
|
0
|
0
|
|
|
|
|
if (!option_int(tagger, override_name, override_value, error, model_index)) return false; |
|
|
0
|
|
|
|
|
|
21468
|
0
|
0
|
|
|
|
|
if (override_value >= 0) |
21469
|
0
|
|
|
|
|
|
const_cast(taggers_data.back().str[1 + i]) = override_value; |
21470
|
|
|
|
|
|
|
} |
21471
|
|
|
|
|
|
|
} else { |
21472
|
|
|
|
|
|
|
taggers_data.pop_back(); |
21473
|
|
|
|
|
|
|
} |
21474
|
0
|
0
|
|
|
|
|
model_name = "from_model_" + to_string(1 + ++model_index); |
21475
|
|
|
|
|
|
|
} while (tagger.count(model_name)); |
21476
|
0
|
0
|
|
|
|
|
if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
|
0
|
|
|
|
|
|
21477
|
|
|
|
|
|
|
|
21478
|
|
|
|
|
|
|
cerr << "Using tagger from given model(s)." << endl; |
21479
|
0
|
0
|
|
|
|
|
os.put(taggers_total); |
21480
|
0
|
0
|
|
|
|
|
for (auto&& tagger_data : taggers_data) |
21481
|
0
|
0
|
|
|
|
|
os.write(tagger_data.str + 1, tagger_data.len - 1); |
21482
|
|
|
|
|
|
|
} else { |
21483
|
|
|
|
|
|
|
// Create MorphoDiTa model(s) |
21484
|
0
|
0
|
|
|
|
|
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21485
|
0
|
0
|
|
|
|
|
if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false; |
|
|
0
|
|
|
|
|
|
21486
|
0
|
0
|
|
|
|
|
if (models > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
|
0
|
|
|
|
|
|
21487
|
|
|
|
|
|
|
|
21488
|
0
|
0
|
|
|
|
|
os.put(models); |
21489
|
0
|
0
|
|
|
|
|
for (int model = 0; model < models; model++) |
21490
|
0
|
0
|
|
|
|
|
if (!train_tagger_model(training, heldout, model, models, tagger, os, error)) |
|
|
0
|
|
|
|
|
|
21491
|
|
|
|
|
|
|
return false; |
21492
|
|
|
|
|
|
|
} |
21493
|
|
|
|
|
|
|
} |
21494
|
|
|
|
|
|
|
|
21495
|
|
|
|
|
|
|
return true; |
21496
|
|
|
|
|
|
|
} |
21497
|
|
|
|
|
|
|
|
21498
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_parser(const vector& training, const vector& heldout, |
21499
|
|
|
|
|
|
|
const string& options, const string& tagger_model, ostream& os, string& error) { |
21500
|
0
|
0
|
|
|
|
|
if (options == NONE) { |
21501
|
0
|
|
|
|
|
|
os.put(0); |
21502
|
|
|
|
|
|
|
} else { |
21503
|
|
|
|
|
|
|
// Create Parsito model |
21504
|
|
|
|
|
|
|
named_values::map parser; |
21505
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, parser, error)) return false; |
|
|
0
|
|
|
|
|
|
21506
|
0
|
0
|
|
|
|
|
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21507
|
|
|
|
|
|
|
|
21508
|
0
|
0
|
|
|
|
|
if (parser.count("from_model")) { |
|
|
0
|
|
|
|
|
|
21509
|
|
|
|
|
|
|
// Use specified parser model |
21510
|
|
|
|
|
|
|
string_piece parser_data; |
21511
|
0
|
0
|
|
|
|
|
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21512
|
0
|
0
|
|
|
|
|
return error.assign("Cannot load model from which the parser should be used!"), false; |
21513
|
|
|
|
|
|
|
|
21514
|
|
|
|
|
|
|
cerr << "Using parser from given model." << endl; |
21515
|
0
|
0
|
|
|
|
|
os.write(parser_data.str, parser_data.len); |
21516
|
|
|
|
|
|
|
} else { |
21517
|
0
|
0
|
|
|
|
|
os.put(1); |
21518
|
|
|
|
|
|
|
|
21519
|
|
|
|
|
|
|
// Parsito options |
21520
|
0
|
0
|
|
|
|
|
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21521
|
0
|
0
|
|
|
|
|
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21522
|
|
|
|
|
|
|
transition_system == "projective" ? "dynamic" : |
21523
|
|
|
|
|
|
|
transition_system == "swap" ? "static_lazy" : |
21524
|
0
|
0
|
|
|
|
|
"static"; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21525
|
|
|
|
|
|
|
|
21526
|
0
|
0
|
|
|
|
|
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21527
|
0
|
0
|
|
|
|
|
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21528
|
0
|
0
|
|
|
|
|
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21529
|
0
|
0
|
|
|
|
|
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21530
|
0
|
0
|
|
|
|
|
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21531
|
0
|
0
|
|
|
|
|
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21532
|
0
|
0
|
|
|
|
|
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21533
|
0
|
0
|
|
|
|
|
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21534
|
|
|
|
|
|
|
string embeddings; |
21535
|
0
|
0
|
|
|
|
|
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21536
|
0
|
0
|
|
|
|
|
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21537
|
0
|
0
|
|
|
|
|
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21538
|
0
|
0
|
|
|
|
|
if (embedding_form) { |
21539
|
0
|
0
|
|
|
|
|
embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount)); |
|
|
0
|
|
|
|
|
|
21540
|
0
|
0
|
|
|
|
|
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21541
|
0
|
0
|
|
|
|
|
embeddings.push_back('\n'); |
21542
|
|
|
|
|
|
|
} |
21543
|
0
|
0
|
|
|
|
|
if (embedding_lemma) { |
21544
|
0
|
0
|
|
|
|
|
embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount)); |
|
|
0
|
|
|
|
|
|
21545
|
0
|
0
|
|
|
|
|
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21546
|
0
|
0
|
|
|
|
|
embeddings.push_back('\n'); |
21547
|
|
|
|
|
|
|
} |
21548
|
0
|
0
|
|
|
|
|
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21549
|
|
|
|
|
|
|
|
21550
|
0
|
0
|
|
|
|
|
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21551
|
0
|
0
|
|
|
|
|
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21552
|
0
|
0
|
|
|
|
|
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21553
|
0
|
0
|
|
|
|
|
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21554
|
0
|
0
|
|
|
|
|
int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2); |
|
|
0
|
|
|
|
|
|
21555
|
0
|
0
|
|
|
|
|
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21556
|
0
|
0
|
|
|
|
|
double learning_rate = run <= 1 ? 0.02 : hyperparameter_logarithmic(run, 2, 0.005, 0.04); |
21557
|
0
|
0
|
|
|
|
|
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21558
|
0
|
0
|
|
|
|
|
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21559
|
0
|
0
|
|
|
|
|
double l2 = run <= 1 ? 0.5 : hyperparameter_uniform(run, 3, 0.2, 0.6); |
21560
|
0
|
0
|
|
|
|
|
if (!option_double(parser, "l2", l2, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21561
|
0
|
0
|
|
|
|
|
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21562
|
|
|
|
|
|
|
|
21563
|
0
|
0
|
|
|
|
|
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21564
|
0
|
|
|
|
|
|
<< ", learning_rate=" << fixed << setprecision(8) << learning_rate |
21565
|
0
|
|
|
|
|
|
<< ", l2=" << l2 << endl; |
21566
|
|
|
|
|
|
|
|
21567
|
|
|
|
|
|
|
// Prepare data in the correct format |
21568
|
|
|
|
|
|
|
parsito::network_parameters parameters; |
21569
|
0
|
|
|
|
|
|
parameters.iterations = iterations; |
21570
|
0
|
|
|
|
|
|
parameters.structured_interval = structured_interval; |
21571
|
0
|
|
|
|
|
|
parameters.hidden_layer = hidden_layer; |
21572
|
0
|
|
|
|
|
|
parameters.hidden_layer_type = parsito::activation_function::TANH; |
21573
|
0
|
|
|
|
|
|
parameters.trainer.algorithm = parsito::network_trainer::SGD; |
21574
|
0
|
|
|
|
|
|
parameters.trainer.learning_rate = learning_rate; |
21575
|
0
|
|
|
|
|
|
parameters.trainer.learning_rate_final = learning_rate_final; |
21576
|
0
|
|
|
|
|
|
parameters.trainer.momentum = 0; |
21577
|
0
|
|
|
|
|
|
parameters.trainer.epsilon = 0; |
21578
|
0
|
|
|
|
|
|
parameters.batch_size = batch_size; |
21579
|
0
|
|
|
|
|
|
parameters.initialization_range = 0.1f; |
21580
|
0
|
|
|
|
|
|
parameters.l1_regularization = 0; |
21581
|
0
|
|
|
|
|
|
parameters.l2_regularization = l2; |
21582
|
0
|
|
|
|
|
|
parameters.maxnorm_regularization = 0; |
21583
|
0
|
|
|
|
|
|
parameters.dropout_hidden = 0; |
21584
|
0
|
|
|
|
|
|
parameters.dropout_input = 0; |
21585
|
0
|
|
|
|
|
|
parameters.early_stopping = early_stopping; |
21586
|
|
|
|
|
|
|
|
21587
|
|
|
|
|
|
|
// Tag the input if required |
21588
|
|
|
|
|
|
|
unique_ptr tagger; |
21589
|
0
|
0
|
|
|
|
|
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21590
|
0
|
0
|
|
|
|
|
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21591
|
0
|
0
|
|
|
|
|
stringstream tagger_description; |
21592
|
0
|
0
|
|
|
|
|
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21593
|
0
|
0
|
|
|
|
|
tagger.reset(model_morphodita_parsito::load(tagger_description)); |
21594
|
0
|
0
|
|
|
|
|
if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false; |
|
|
0
|
|
|
|
|
|
21595
|
|
|
|
|
|
|
} |
21596
|
|
|
|
|
|
|
|
21597
|
|
|
|
|
|
|
// Training data |
21598
|
0
|
0
|
|
|
|
|
sentence tagged; |
21599
|
0
|
|
|
|
|
|
vector train_trees; |
21600
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) { |
21601
|
0
|
0
|
|
|
|
|
tagged = sentence; |
21602
|
0
|
0
|
|
|
|
|
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21603
|
|
|
|
|
|
|
|
21604
|
0
|
0
|
|
|
|
|
train_trees.emplace_back(); |
21605
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) { |
21606
|
0
|
|
|
|
|
|
train_trees.back().add_node(string()); |
21607
|
0
|
0
|
|
|
|
|
model_normalize_form(tagged.words[i].form, train_trees.back().nodes.back().form); |
21608
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().lemma.assign(tagged.words[i].lemma); |
21609
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().upostag.assign(tagged.words[i].upostag); |
21610
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().xpostag.assign(tagged.words[i].xpostag); |
21611
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().feats.assign(tagged.words[i].feats); |
21612
|
|
|
|
|
|
|
} |
21613
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) |
21614
|
0
|
0
|
|
|
|
|
train_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
21615
|
|
|
|
|
|
|
} |
21616
|
|
|
|
|
|
|
|
21617
|
|
|
|
|
|
|
// Heldout data |
21618
|
0
|
|
|
|
|
|
vector heldout_trees; |
21619
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
21620
|
0
|
0
|
|
|
|
|
tagged = sentence; |
21621
|
0
|
0
|
|
|
|
|
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21622
|
|
|
|
|
|
|
|
21623
|
0
|
0
|
|
|
|
|
heldout_trees.emplace_back(); |
21624
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) { |
21625
|
0
|
|
|
|
|
|
heldout_trees.back().add_node(string()); |
21626
|
0
|
0
|
|
|
|
|
model_normalize_form(tagged.words[i].form, heldout_trees.back().nodes.back().form); |
21627
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().lemma.assign(tagged.words[i].lemma); |
21628
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().upostag.assign(tagged.words[i].upostag); |
21629
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().xpostag.assign(tagged.words[i].xpostag); |
21630
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().feats.assign(tagged.words[i].feats); |
21631
|
|
|
|
|
|
|
} |
21632
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) |
21633
|
0
|
0
|
|
|
|
|
heldout_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
21634
|
|
|
|
|
|
|
} |
21635
|
|
|
|
|
|
|
|
21636
|
|
|
|
|
|
|
cerr << "Parser transition options: system=" << transition_system << ", oracle=" << transition_oracle |
21637
|
0
|
0
|
|
|
|
|
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21638
|
0
|
0
|
|
|
|
|
<< "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl |
|
|
0
|
|
|
|
|
|
21639
|
0
|
0
|
|
|
|
|
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21640
|
0
|
0
|
|
|
|
|
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21641
|
0
|
0
|
|
|
|
|
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21642
|
0
|
0
|
|
|
|
|
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21643
|
0
|
0
|
|
|
|
|
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21644
|
0
|
|
|
|
|
|
<< " learning_rate=" << fixed << setprecision(4) << learning_rate << ", learning_rate_final=" << learning_rate_final |
21645
|
0
|
0
|
|
|
|
|
<< ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
|
0
|
|
|
|
|
|
21646
|
|
|
|
|
|
|
|
21647
|
|
|
|
|
|
|
// Train the parser |
21648
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
21649
|
0
|
0
|
|
|
|
|
enc.add_str("nn_versioned"); |
21650
|
0
|
|
|
|
|
|
parsito::parser_nn_trainer::train(transition_system, transition_oracle, single_root, embeddings, parser_nodes, |
21651
|
0
|
0
|
|
|
|
|
parameters, 1, train_trees, heldout_trees, enc); |
21652
|
0
|
0
|
|
|
|
|
compressor::save(os, enc); |
21653
|
|
|
|
|
|
|
} |
21654
|
|
|
|
|
|
|
} |
21655
|
|
|
|
|
|
|
|
21656
|
|
|
|
|
|
|
return true; |
21657
|
|
|
|
|
|
|
} |
21658
|
|
|
|
|
|
|
|
21659
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::load_model(const string& data, model_type model, string_piece& range) { |
21660
|
0
|
|
|
|
|
|
istringstream is(data); |
21661
|
|
|
|
|
|
|
|
21662
|
|
|
|
|
|
|
// Check that it is morphodita_parsito model. |
21663
|
|
|
|
|
|
|
char len; |
21664
|
0
|
0
|
|
|
|
|
if (!is.get(len)) return false; |
|
|
0
|
|
|
|
|
|
21665
|
0
|
|
|
|
|
|
string name(len, ' '); |
21666
|
0
|
0
|
|
|
|
|
if (!is.read(&name[0], len)) return false; |
|
|
0
|
|
|
|
|
|
21667
|
0
|
0
|
|
|
|
|
if (name != "morphodita_parsito") return false; |
21668
|
|
|
|
|
|
|
|
21669
|
|
|
|
|
|
|
char version; |
21670
|
0
|
0
|
|
|
|
|
if (!is.get(version)) return false; |
|
|
0
|
|
|
|
|
|
21671
|
0
|
0
|
|
|
|
|
if (!(version >= 1 && version <= model_morphodita_parsito::VERSION_LATEST)) return false; |
21672
|
|
|
|
|
|
|
|
21673
|
|
|
|
|
|
|
// Because UDPipe 1.0 does not check the model version, |
21674
|
|
|
|
|
|
|
// a specific sentinel was added since version 2 so that |
21675
|
|
|
|
|
|
|
// loading of such model fail on UDPipe 1.0 |
21676
|
0
|
0
|
|
|
|
|
if (version >= 2) { |
21677
|
|
|
|
|
|
|
char sentinel; |
21678
|
0
|
0
|
|
|
|
|
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21679
|
0
|
0
|
|
|
|
|
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21680
|
|
|
|
|
|
|
} |
21681
|
|
|
|
|
|
|
|
21682
|
|
|
|
|
|
|
// Tokenizer |
21683
|
|
|
|
|
|
|
{ |
21684
|
0
|
0
|
|
|
|
|
if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg(); |
|
|
0
|
|
|
|
|
|
21685
|
0
|
0
|
|
|
|
|
char tokenizer; if (!is.get(tokenizer)) return false; |
|
|
0
|
|
|
|
|
|
21686
|
0
|
0
|
|
|
|
|
unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
|
0
|
|
|
|
|
|
21687
|
0
|
0
|
|
|
|
|
if (tokenizer && !tokenizer_factory) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21688
|
0
|
0
|
|
|
|
|
unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
|
0
|
|
|
|
|
|
21689
|
0
|
0
|
|
|
|
|
if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
0
|
|
|
|
|
|
21690
|
|
|
|
|
|
|
} |
21691
|
|
|
|
|
|
|
|
21692
|
|
|
|
|
|
|
// Tagger |
21693
|
|
|
|
|
|
|
{ |
21694
|
0
|
0
|
|
|
|
|
if (model == TAGGER_MODEL) range.str = data.data() + is.tellg(); |
|
|
0
|
|
|
|
|
|
21695
|
0
|
0
|
|
|
|
|
char taggers; if (!is.get(taggers)) return false; |
|
|
0
|
|
|
|
|
|
21696
|
0
|
0
|
|
|
|
|
for (char i = 0; i < taggers; i++) { |
21697
|
0
|
0
|
|
|
|
|
char lemma; if (!is.get(lemma)) return false; |
|
|
0
|
|
|
|
|
|
21698
|
0
|
0
|
|
|
|
|
char xpostag; if (!is.get(xpostag)) return false; |
|
|
0
|
|
|
|
|
|
21699
|
0
|
0
|
|
|
|
|
char feats; if (!is.get(feats)) return false; |
|
|
0
|
|
|
|
|
|
21700
|
0
|
0
|
|
|
|
|
unique_ptr tagger(morphodita::tagger::load(is)); |
21701
|
0
|
0
|
|
|
|
|
if (!tagger) return false; |
21702
|
|
|
|
|
|
|
} |
21703
|
0
|
0
|
|
|
|
|
if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
0
|
|
|
|
|
|
21704
|
|
|
|
|
|
|
} |
21705
|
|
|
|
|
|
|
|
21706
|
|
|
|
|
|
|
// Parser |
21707
|
|
|
|
|
|
|
{ |
21708
|
0
|
0
|
|
|
|
|
if (model == PARSER_MODEL) range.str = data.data() + is.tellg(); |
|
|
0
|
|
|
|
|
|
21709
|
|
|
|
|
|
|
char parser; |
21710
|
0
|
0
|
|
|
|
|
if (!is.get(parser)) return false; |
|
|
0
|
|
|
|
|
|
21711
|
0
|
0
|
|
|
|
|
unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr); |
|
|
0
|
|
|
|
|
|
21712
|
0
|
0
|
|
|
|
|
if (parser && !parser_model) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21713
|
0
|
0
|
|
|
|
|
if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
0
|
|
|
|
|
|
21714
|
|
|
|
|
|
|
} |
21715
|
|
|
|
|
|
|
|
21716
|
0
|
|
|
|
|
|
return false; |
21717
|
|
|
|
|
|
|
} |
21718
|
|
|
|
|
|
|
|
21719
|
0
|
|
|
|
|
|
// Normalize a word form the way a model of the latest format version does.
// The normalized form is written to `output`; the callee's return value is
// passed through unchanged.
const string& trainer_morphodita_parsito::model_normalize_form(string_piece form, string& output) {
  // A short-lived model instance is enough: only its normalization routine is used.
  model_morphodita_parsito latest_model(model_morphodita_parsito::VERSION_LATEST);
  return latest_model.normalize_form(form, output);
}
21722
|
|
|
|
|
|
|
|
21723
|
0
|
|
|
|
|
|
// Normalize a lemma the way a model of the latest format version does.
// The normalized lemma is written to `output`; the callee's return value is
// passed through unchanged.
const string& trainer_morphodita_parsito::model_normalize_lemma(string_piece lemma, string& output) {
  // A short-lived model instance is enough: only its normalization routine is used.
  model_morphodita_parsito latest_model(model_morphodita_parsito::VERSION_LATEST);
  return latest_model.normalize_lemma(lemma, output);
}
21726
|
|
|
|
|
|
|
|
21727
|
0
|
|
|
|
|
|
// Copy one morphological analysis into `word` using the routine of a model of
// the latest format version. The `upostag`, `lemma`, `xpostag` and `feats`
// arguments are forwarded as given; the callee's second argument is always
// passed as false.
void trainer_morphodita_parsito::model_fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word) {
  // A short-lived model instance is enough: only its fill routine is used.
  model_morphodita_parsito latest_model(model_morphodita_parsito::VERSION_LATEST);
  latest_model.fill_word_analysis(analysis, false, upostag, lemma, xpostag, feats, word);
}
21730
|
|
|
|
|
|
|
|
21731
|
|
|
|
|
|
|
// Tagger model helper functions |
21732
|
|
|
|
|
|
|
|
21733
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_tagger_model(const vector& training, const vector& heldout, |
21734
|
|
|
|
|
|
|
unsigned model, unsigned models, const named_values::map& tagger, |
21735
|
|
|
|
|
|
|
ostream& os, string& error) { |
21736
|
0
|
0
|
|
|
|
|
unique_ptr conllu_input_format(input_format::new_conllu_input_format()); |
21737
|
|
|
|
|
|
|
|
21738
|
0
|
0
|
|
|
|
|
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21739
|
|
|
|
|
|
|
|
21740
|
|
|
|
|
|
|
bool have_lemma = false; |
21741
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
21742
|
0
|
0
|
|
|
|
|
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21743
|
0
|
0
|
|
|
|
|
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21744
|
|
|
|
|
|
|
have_lemma = true; |
21745
|
0
|
0
|
|
|
|
|
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21746
|
0
|
0
|
|
|
|
|
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21747
|
0
|
0
|
|
|
|
|
int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0; |
|
|
0
|
|
|
|
|
|
21748
|
0
|
0
|
|
|
|
|
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21749
|
0
|
0
|
|
|
|
|
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21750
|
|
|
|
|
|
|
|
21751
|
0
|
0
|
|
|
|
|
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21752
|
0
|
0
|
|
|
|
|
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21753
|
0
|
0
|
|
|
|
|
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21754
|
0
|
0
|
|
|
|
|
os.put(char(provide_lemma ? use_lemma : 0)); |
|
|
0
|
|
|
|
|
|
21755
|
0
|
0
|
|
|
|
|
os.put(char(provide_xpostag && use_xpostag)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21756
|
0
|
0
|
|
|
|
|
os.put(char(provide_feats && use_feats)); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21757
|
|
|
|
|
|
|
|
21758
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21759
|
0
|
0
|
|
|
|
|
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21760
|
0
|
0
|
|
|
|
|
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21761
|
|
|
|
|
|
|
|
21762
|
|
|
|
|
|
|
// Start by creating the morphological dictionary |
21763
|
0
|
0
|
|
|
|
|
stringstream morpho_description; |
21764
|
|
|
|
|
|
|
string normalized_form, combined_tag, combined_lemma; |
21765
|
|
|
|
|
|
|
|
21766
|
|
|
|
|
|
|
// Generic options |
21767
|
0
|
0
|
|
|
|
|
const string& dictionary_model = option_str(tagger, "dictionary_model", model); |
|
|
0
|
|
|
|
|
|
21768
|
0
|
0
|
|
|
|
|
if (!dictionary_model.empty()) { |
21769
|
|
|
|
|
|
|
// Use specified morphological dictionary |
21770
|
|
|
|
|
|
|
cerr << "Using given morphological dictionary for tagger model " << model+1 << "." << endl; |
21771
|
|
|
|
|
|
|
morpho_description << dictionary_model; |
21772
|
|
|
|
|
|
|
} else { |
21773
|
|
|
|
|
|
|
// Create the morphological dictionary and guesser from data |
21774
|
|
|
|
|
|
|
cerr << "Creating morphological dictionary for tagger model " << model+1 << "." << endl; |
21775
|
|
|
|
|
|
|
|
21776
|
|
|
|
|
|
|
// Dictionary options |
21777
|
0
|
0
|
|
|
|
|
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21778
|
|
|
|
|
|
|
unordered_set flat_lemmas; |
21779
|
0
|
0
|
|
|
|
|
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21780
|
0
|
|
|
|
|
|
vector lemmas; |
21781
|
0
|
0
|
|
|
|
|
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21782
|
0
|
0
|
|
|
|
|
for (auto&& lemma : lemmas) { |
21783
|
0
|
0
|
|
|
|
|
if (lemma.find('~') != string::npos) |
21784
|
0
|
0
|
|
|
|
|
return error.assign("Dictionary_flat_lemmas cannot contain '~' character!"), false; |
21785
|
|
|
|
|
|
|
flat_lemmas.insert(lemma); |
21786
|
|
|
|
|
|
|
} |
21787
|
|
|
|
|
|
|
} else { |
21788
|
0
|
0
|
|
|
|
|
flat_lemmas.insert("greek.expression"); |
21789
|
|
|
|
|
|
|
} |
21790
|
|
|
|
|
|
|
|
21791
|
0
|
0
|
|
|
|
|
if (!option_str(tagger, "dictionary", model).empty()) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21792
|
0
|
0
|
|
|
|
|
return error.assign("The tagger 'dictionary' option is no longer supported, use 'dictionary_file' instead!"), false; |
21793
|
0
|
0
|
|
|
|
|
const string& dictionary_file = option_str(tagger, "dictionary_file", model); |
|
|
0
|
|
|
|
|
|
21794
|
0
|
0
|
|
|
|
|
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21795
|
|
|
|
|
|
|
|
21796
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " dictionary options: " << "max_form_analyses=" << max_form_analyses |
21797
|
0
|
0
|
|
|
|
|
<< ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl; |
|
|
0
|
|
|
|
|
|
21798
|
|
|
|
|
|
|
|
21799
|
|
|
|
|
|
|
// Guesser options |
21800
|
0
|
0
|
|
|
|
|
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21801
|
0
|
0
|
|
|
|
|
int guesser_suffix_rules = run <= 1 ? 8 : 5 + hyperparameter_integer(run, 1, 0, 7); |
21802
|
0
|
0
|
|
|
|
|
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21803
|
0
|
0
|
|
|
|
|
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21804
|
0
|
0
|
|
|
|
|
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21805
|
0
|
0
|
|
|
|
|
int guesser_enrich_dictionary = run <= 1 ? 6 : 3 + hyperparameter_integer(run, 2, 0, 7); |
21806
|
0
|
0
|
|
|
|
|
if (!dictionary_file.empty()) guesser_enrich_dictionary = 0; |
21807
|
0
|
0
|
|
|
|
|
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21808
|
|
|
|
|
|
|
|
21809
|
0
|
0
|
|
|
|
|
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21810
|
0
|
0
|
|
|
|
|
<< ", guesser_enrich_dictionary=" << guesser_enrich_dictionary << endl; |
21811
|
|
|
|
|
|
|
|
21812
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " guesser options: " << "suffix_rules=" << guesser_suffix_rules |
21813
|
0
|
0
|
|
|
|
|
<< ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count |
|
|
0
|
|
|
|
|
|
21814
|
0
|
0
|
|
|
|
|
<< ", enrich_dictionary=" << guesser_enrich_dictionary << endl; |
21815
|
|
|
|
|
|
|
|
21816
|
|
|
|
|
|
|
// Start by generating statistical guesser |
21817
|
0
|
0
|
|
|
|
|
stringstream guesser_description; |
21818
|
|
|
|
|
|
|
{ |
21819
|
0
|
0
|
|
|
|
|
stringstream guesser_input; |
21820
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) { |
21821
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
21822
|
0
|
0
|
|
|
|
|
guesser_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
21823
|
0
|
0
|
|
|
|
|
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas) << '\t' |
21824
|
0
|
0
|
|
|
|
|
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
21825
|
|
|
|
|
|
|
guesser_input << '\n'; |
21826
|
|
|
|
|
|
|
} |
21827
|
0
|
0
|
|
|
|
|
morphodita::morpho_statistical_guesser_trainer::train(guesser_input, guesser_suffix_len, guesser_suffix_rules, guesser_prefixes_max, guesser_prefix_min_count, guesser_description); |
21828
|
|
|
|
|
|
|
} |
21829
|
|
|
|
|
|
|
|
21830
|
|
|
|
|
|
|
// Generate morphological dictionary data from the input |
21831
|
|
|
|
|
|
|
unordered_set dictionary_entries; |
21832
|
|
|
|
|
|
|
{ |
21833
|
|
|
|
|
|
|
unordered_map> entries; |
21834
|
|
|
|
|
|
|
string entry; |
21835
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
21836
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) { |
21837
|
0
|
0
|
|
|
|
|
model_normalize_form(sentence.words[i].form, normalized_form); |
21838
|
0
|
0
|
|
|
|
|
entry.assign(combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas)) |
21839
|
0
|
0
|
|
|
|
|
.append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag)) |
|
|
0
|
|
|
|
|
|
21840
|
0
|
0
|
|
|
|
|
.append("\t").append(normalized_form); |
21841
|
0
|
|
|
|
|
|
entries[normalized_form][entry]++; |
21842
|
|
|
|
|
|
|
} |
21843
|
|
|
|
|
|
|
|
21844
|
0
|
|
|
|
|
|
vector> analyses; |
21845
|
0
|
0
|
|
|
|
|
for (auto&& form_analyses : entries) { |
21846
|
0
|
|
|
|
|
|
analyses.clear(); |
21847
|
0
|
0
|
|
|
|
|
for (auto&& analysis : form_analyses.second) |
21848
|
0
|
0
|
|
|
|
|
analyses.emplace_back(analysis.second, analysis.first); |
21849
|
0
|
0
|
|
|
|
|
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21850
|
|
|
|
|
|
|
sort(analyses.begin(), analyses.end(), greater>()); |
21851
|
0
|
0
|
|
|
|
|
analyses.resize(max_form_analyses); |
21852
|
|
|
|
|
|
|
} |
21853
|
0
|
0
|
|
|
|
|
for (auto&& analysis : analyses) |
21854
|
0
|
|
|
|
|
|
dictionary_entries.insert(analysis.second); |
21855
|
|
|
|
|
|
|
} |
21856
|
|
|
|
|
|
|
} |
21857
|
0
|
|
|
|
|
|
morphodita::generic_morpho_encoder::tags dictionary_special_tags; |
21858
|
|
|
|
|
|
|
dictionary_special_tags.unknown_tag = "~X"; |
21859
|
0
|
0
|
|
|
|
|
dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag); |
|
|
0
|
|
|
|
|
|
21860
|
0
|
0
|
|
|
|
|
dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag); |
|
|
0
|
|
|
|
|
|
21861
|
0
|
0
|
|
|
|
|
dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag); |
|
|
0
|
|
|
|
|
|
21862
|
|
|
|
|
|
|
|
21863
|
|
|
|
|
|
|
// Append given dictionary_file if given |
21864
|
0
|
0
|
|
|
|
|
if (!dictionary_file.empty()) { |
21865
|
0
|
0
|
|
|
|
|
ifstream is(path_from_utf8(dictionary_file).c_str()); |
21866
|
0
|
0
|
|
|
|
|
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21867
|
|
|
|
|
|
|
|
21868
|
|
|
|
|
|
|
vector dictionary_parts; |
21869
|
0
|
0
|
|
|
|
|
word entry; |
21870
|
|
|
|
|
|
|
string entry_encoded, line; |
21871
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
0
|
|
|
|
|
|
21872
|
|
|
|
|
|
|
// Skip empty lines |
21873
|
0
|
0
|
|
|
|
|
if (line.empty()) continue; |
21874
|
|
|
|
|
|
|
|
21875
|
0
|
0
|
|
|
|
|
split(line, '\t', dictionary_parts); |
21876
|
|
|
|
|
|
|
|
21877
|
0
|
0
|
|
|
|
|
if (dictionary_parts.size() != 5) |
21878
|
0
|
0
|
|
|
|
|
return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false; |
|
|
0
|
|
|
|
|
|
21879
|
|
|
|
|
|
|
|
21880
|
0
|
0
|
|
|
|
|
model_normalize_form(dictionary_parts[0], entry.form); |
21881
|
0
|
0
|
|
|
|
|
entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len); |
|
|
0
|
|
|
|
|
|
21882
|
0
|
0
|
|
|
|
|
entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len); |
|
|
0
|
|
|
|
|
|
21883
|
0
|
0
|
|
|
|
|
entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len); |
|
|
0
|
|
|
|
|
|
21884
|
0
|
0
|
|
|
|
|
entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len); |
|
|
0
|
|
|
|
|
|
21885
|
|
|
|
|
|
|
|
21886
|
0
|
0
|
|
|
|
|
entry_encoded.assign(combine_lemma(entry, use_lemma, combined_lemma, flat_lemmas)) |
21887
|
0
|
0
|
|
|
|
|
.append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag)) |
|
|
0
|
|
|
|
|
|
21888
|
0
|
0
|
|
|
|
|
.append("\t").append(entry.form); |
21889
|
|
|
|
|
|
|
dictionary_entries.insert(entry_encoded); |
21890
|
|
|
|
|
|
|
} |
21891
|
|
|
|
|
|
|
} |
21892
|
|
|
|
|
|
|
|
21893
|
|
|
|
|
|
|
// Enrich the dictionary if required |
21894
|
0
|
0
|
|
|
|
|
if (guesser_enrich_dictionary) { |
21895
|
|
|
|
|
|
|
// Create temporary morphology using only the guesser |
21896
|
0
|
0
|
|
|
|
|
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21897
|
0
|
0
|
|
|
|
|
guesser_only_morphology.put(morphodita::morpho_ids::GENERIC); |
21898
|
0
|
0
|
|
|
|
|
morphodita::generic_morpho_encoder::encode(empty_data, dictionary_suffix_len, dictionary_special_tags, guesser_description_copy, guesser_only_morphology); |
21899
|
|
|
|
|
|
|
|
21900
|
0
|
0
|
|
|
|
|
unique_ptr guesser_only_morpho(morphodita::morpho::load(guesser_only_morphology)); |
21901
|
0
|
0
|
|
|
|
|
if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false; |
|
|
0
|
|
|
|
|
|
21902
|
|
|
|
|
|
|
|
21903
|
|
|
|
|
|
|
string entry; |
21904
|
|
|
|
|
|
|
unordered_set analyzed_forms; |
21905
|
0
|
|
|
|
|
|
vector analyses; |
21906
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
21907
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) { |
21908
|
0
|
0
|
|
|
|
|
const auto& form = model_normalize_form(sentence.words[i].form, normalized_form); |
21909
|
0
|
0
|
|
|
|
|
if (!analyzed_forms.count(form)) { |
21910
|
0
|
0
|
|
|
|
|
guesser_only_morpho->analyze(form, morphodita::morpho::GUESSER, analyses); |
21911
|
|
|
|
|
|
|
|
21912
|
0
|
|
|
|
|
|
int to_add = guesser_enrich_dictionary; |
21913
|
0
|
0
|
|
|
|
|
for (auto&& analyse : analyses) { |
21914
|
0
|
0
|
|
|
|
|
entry.assign(analyse.lemma).push_back('\t'); |
21915
|
0
|
0
|
|
|
|
|
entry.append(analyse.tag).push_back('\t'); |
21916
|
|
|
|
|
|
|
entry.append(form); |
21917
|
0
|
0
|
|
|
|
|
if (dictionary_entries.insert(entry).second) |
21918
|
0
|
0
|
|
|
|
|
if (!--to_add) |
21919
|
|
|
|
|
|
|
break; |
21920
|
|
|
|
|
|
|
} |
21921
|
|
|
|
|
|
|
analyzed_forms.insert(form); |
21922
|
|
|
|
|
|
|
} |
21923
|
|
|
|
|
|
|
} |
21924
|
|
|
|
|
|
|
} |
21925
|
|
|
|
|
|
|
|
21926
|
|
|
|
|
|
|
// Create the dictionary |
21927
|
0
|
0
|
|
|
|
|
vector sorted_dictionary(dictionary_entries.begin(), dictionary_entries.end()); |
21928
|
|
|
|
|
|
|
sort(sorted_dictionary.begin(), sorted_dictionary.end()); |
21929
|
|
|
|
|
|
|
|
21930
|
0
|
0
|
|
|
|
|
stringstream morpho_input; |
21931
|
0
|
0
|
|
|
|
|
for (auto&& entry : sorted_dictionary) |
21932
|
|
|
|
|
|
|
morpho_input << entry << '\n'; |
21933
|
|
|
|
|
|
|
|
21934
|
0
|
0
|
|
|
|
|
morpho_description.put(morphodita::morpho_ids::GENERIC); |
21935
|
0
|
0
|
|
|
|
|
morphodita::generic_morpho_encoder::encode(morpho_input, dictionary_suffix_len, dictionary_special_tags, guesser_description, morpho_description); |
21936
|
|
|
|
|
|
|
} |
21937
|
|
|
|
|
|
|
|
21938
|
|
|
|
|
|
|
// Measure dictionary accuracy if required |
21939
|
0
|
0
|
|
|
|
|
const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model); |
|
|
0
|
|
|
|
|
|
21940
|
0
|
0
|
|
|
|
|
if (!dictionary_accuracy.empty()) { |
21941
|
0
|
0
|
|
|
|
|
unique_ptr morpho(morphodita::morpho::load(morpho_description)); |
21942
|
0
|
0
|
|
|
|
|
if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false; |
|
|
0
|
|
|
|
|
|
21943
|
0
|
0
|
|
|
|
|
morpho_description.seekg(0, ios::beg); |
21944
|
|
|
|
|
|
|
|
21945
|
|
|
|
|
|
|
// Measure dictionary accuracy on given data |
21946
|
|
|
|
|
|
|
unsigned words = 0, total_analyses = 0, upostag = 0, xpostag = 0, feats = 0, all_tags = 0, lemma = 0; |
21947
|
|
|
|
|
|
|
|
21948
|
0
|
0
|
|
|
|
|
word w; |
21949
|
0
|
|
|
|
|
|
vector analyses; |
21950
|
0
|
0
|
|
|
|
|
conllu_input_format->set_text(dictionary_accuracy.c_str()); |
21951
|
0
|
0
|
|
|
|
|
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21952
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) { |
21953
|
0
|
0
|
|
|
|
|
morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses); |
|
|
0
|
|
|
|
|
|
21954
|
|
|
|
|
|
|
unsigned upostag_ok = 0, xpostag_ok = 0, feats_ok = 0, all_tags_ok = 0, lemma_ok = 0; |
21955
|
0
|
0
|
|
|
|
|
for (auto&& analysis : analyses) { |
21956
|
0
|
0
|
|
|
|
|
w.lemma.assign("_"); |
21957
|
0
|
0
|
|
|
|
|
model_fill_word_analysis(analysis, true, use_lemma, true, true, w); |
21958
|
0
|
|
|
|
|
|
upostag_ok |= int(sentence.words[i].upostag == w.upostag); |
21959
|
0
|
|
|
|
|
|
xpostag_ok |= int(sentence.words[i].xpostag == w.xpostag); |
21960
|
0
|
|
|
|
|
|
feats_ok |= int(sentence.words[i].feats == w.feats); |
21961
|
0
|
0
|
|
|
|
|
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21962
|
0
|
|
|
|
|
|
lemma_ok |= int(sentence.words[i].lemma == w.lemma); |
21963
|
|
|
|
|
|
|
} |
21964
|
0
|
|
|
|
|
|
words++; |
21965
|
0
|
|
|
|
|
|
total_analyses += analyses.size(); |
21966
|
0
|
|
|
|
|
|
upostag += upostag_ok; |
21967
|
0
|
|
|
|
|
|
xpostag += xpostag_ok; |
21968
|
0
|
|
|
|
|
|
feats += feats_ok; |
21969
|
0
|
|
|
|
|
|
all_tags += all_tags_ok; |
21970
|
0
|
|
|
|
|
|
lemma += lemma_ok; |
21971
|
|
|
|
|
|
|
} |
21972
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
21973
|
|
|
|
|
|
|
|
21974
|
|
|
|
|
|
|
cerr << "Dictionary accuracy for tagging model " << model+1 << " - forms: " << words |
21975
|
0
|
|
|
|
|
|
<< ", analyses per form: " << fixed << setprecision(2) << total_analyses / double(words) |
21976
|
0
|
|
|
|
|
|
<< ", upostag: " << setprecision(1) << 100. * upostag / words << "%, xpostag: " << 100. * xpostag / words |
21977
|
0
|
|
|
|
|
|
<< "%, feats: " << 100. * feats / words << "%, all tags: " << 100. * all_tags / words << "%, lemma: " << 100. * lemma / words << '%' << endl; |
21978
|
|
|
|
|
|
|
} |
21979
|
|
|
|
|
|
|
|
21980
|
|
|
|
|
|
|
// Tagger options |
21981
|
0
|
0
|
|
|
|
|
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21982
|
|
|
|
|
|
|
morphodita::tagger_id tagger_id; |
21983
|
0
|
0
|
|
|
|
|
if (tagger_order == 2) tagger_id = morphodita::tagger_ids::CONLLU2; |
21984
|
0
|
0
|
|
|
|
|
else if (tagger_order == 2.5) tagger_id = morphodita::tagger_ids::CONLLU2_3; |
21985
|
0
|
0
|
|
|
|
|
else if (tagger_order == 3) tagger_id = morphodita::tagger_ids::CONLLU3; |
21986
|
0
|
0
|
|
|
|
|
else return error.assign("The tagger_order can be only 2, 2.5 or 3!"), false; |
21987
|
|
|
|
|
|
|
|
21988
|
0
|
0
|
|
|
|
|
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21989
|
0
|
0
|
|
|
|
|
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21990
|
0
|
0
|
|
|
|
|
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21991
|
|
|
|
|
|
|
const string& tagger_feature_templates = |
21992
|
0
|
0
|
|
|
|
|
option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger : |
|
|
0
|
|
|
|
|
|
21993
|
0
|
0
|
|
|
|
|
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21994
|
0
|
0
|
|
|
|
|
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21995
|
0
|
0
|
|
|
|
|
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
21996
|
0
|
0
|
|
|
|
|
if (heldout.empty()) tagger_early_stopping = false; |
21997
|
|
|
|
|
|
|
|
21998
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " options: iterations=" << tagger_iterations |
21999
|
0
|
0
|
|
|
|
|
<< ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates=" |
|
|
0
|
|
|
|
|
|
22000
|
0
|
|
|
|
|
|
<< (tagger_feature_templates == tagger_features_tagger ? "tagger" : |
22001
|
0
|
0
|
|
|
|
|
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22002
|
|
|
|
|
|
|
|
22003
|
|
|
|
|
|
|
// Train the tagger |
22004
|
|
|
|
|
|
|
cerr << "Training tagger model " << model+1 << "." << endl; |
22005
|
0
|
0
|
|
|
|
|
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22006
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) { |
22007
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
22008
|
0
|
0
|
|
|
|
|
input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
22009
|
0
|
0
|
|
|
|
|
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
22010
|
0
|
0
|
|
|
|
|
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
22011
|
|
|
|
|
|
|
input << '\n'; |
22012
|
|
|
|
|
|
|
} |
22013
|
|
|
|
|
|
|
|
22014
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
22015
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
22016
|
0
|
0
|
|
|
|
|
heldout_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
22017
|
0
|
0
|
|
|
|
|
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
22018
|
0
|
0
|
|
|
|
|
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
22019
|
|
|
|
|
|
|
heldout_input << '\n'; |
22020
|
|
|
|
|
|
|
} |
22021
|
|
|
|
|
|
|
|
22022
|
0
|
0
|
|
|
|
|
os.put(tagger_id); |
22023
|
0
|
0
|
|
|
|
|
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22024
|
|
|
|
|
|
|
|
22025
|
|
|
|
|
|
|
return true; |
22026
|
|
|
|
|
|
|
} |
22027
|
|
|
|
|
|
|
|
22028
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::can_combine_tag(const word& w, string& error) { |
22029
|
|
|
|
|
|
|
error.clear(); |
22030
|
|
|
|
|
|
|
|
22031
|
|
|
|
|
|
|
unsigned separator = 0; |
22032
|
0
|
0
|
|
|
|
|
while (separator < tag_separators.size() && |
22033
|
0
|
0
|
|
|
|
|
(w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos)) |
22034
|
0
|
|
|
|
|
|
separator++; |
22035
|
|
|
|
|
|
|
|
22036
|
0
|
0
|
|
|
|
|
if (separator >= tag_separators.size()) { |
22037
|
0
|
|
|
|
|
|
error.assign("Cannot find tag separating character, UPOSTAG and XPOSTAG contain all of '").append(tag_separators).append("'!"); |
22038
|
0
|
|
|
|
|
|
return false; |
22039
|
|
|
|
|
|
|
} |
22040
|
|
|
|
|
|
|
return true; |
22041
|
|
|
|
|
|
|
} |
22042
|
|
|
|
|
|
|
|
22043
|
0
|
|
|
|
|
|
// Builds the combined tag of a word into combined_tag and returns a reference to it.
//
// The result starts with a delimiter character, followed by UPOSTAG. When xpostag
// or feats is requested, another delimiter follows, then XPOSTAG (only if xpostag
// is set), and, if feats is set, one more delimiter and FEATS. The delimiter is the
// first character of tag_separators appearing in neither UPOSTAG nor XPOSTAG.
const string& trainer_morphodita_parsito::combine_tag(const word& w, bool xpostag, bool feats, string& combined_tag) {
  // Locate the first separator candidate absent from both UPOSTAG and XPOSTAG.
  size_t chosen = 0;
  for (; chosen < tag_separators.size(); chosen++) {
    const char candidate = tag_separators[chosen];
    if (w.upostag.find(candidate) == string::npos && w.xpostag.find(candidate) == string::npos)
      break;
  }
  // Should not happen, as can_combine_tag was called before
  if (chosen >= tag_separators.size()) chosen = 0;

  const char delimiter = tag_separators[chosen];

  combined_tag.assign(1, delimiter);
  combined_tag += w.upostag;
  if (!xpostag && !feats) return combined_tag;

  // The delimiter after UPOSTAG is emitted even when XPOSTAG itself is skipped.
  combined_tag += delimiter;
  if (xpostag) combined_tag += w.xpostag;
  if (feats) {
    combined_tag += delimiter;
    combined_tag += w.feats;
  }

  return combined_tag;
}
22063
|
|
|
|
|
|
|
|
22064
|
0
|
|
|
|
|
|
// Finds the most frequent combined tag among all words of data whose UPOSTAG
// equals the given upostag, stores it in combined_tag and returns a reference to it.
//
// Parameters:
//   data         - sentences to scan; word index 0 of every sentence is skipped
//   upostag      - only words with exactly this UPOSTAG are counted
//   xpostag      - passed to combine_tag: include XPOSTAG in the combined tag
//   feats        - passed to combine_tag: include FEATS in the combined tag
//   combined_tag - scratch buffer while counting, and the output of the function
//
// When no word has the requested UPOSTAG, the result is "~" followed by upostag.
const string& trainer_morphodita_parsito::most_frequent_tag(const vector<sentence>& data, const string& upostag, bool xpostag, bool feats, string& combined_tag) {
  // Occurrence count of every combined tag seen with the requested UPOSTAG.
  unordered_map<string, unsigned> counts;

  for (auto&& sentence : data)
    for (size_t i = 1; i < sentence.words.size(); i++)
      if (sentence.words[i].upostag == upostag)
        counts[combine_tag(sentence.words[i], xpostag, feats, combined_tag)]++;

  // Fallback used when nothing was counted.
  combined_tag.assign("~").append(upostag);
  unsigned best = 0;
  for (auto&& tags : counts)
    if (tags.second > best) {
      best = tags.second;
      combined_tag.assign(tags.first);
    }
  return combined_tag;
}
22081
|
|
|
|
|
|
|
|
22082
|
0
|
|
|
|
|
|
// Computes the lemma string used for a word and returns a reference to it.
//
// Parameters:
//   w              - the word whose form and lemma are used
//   use_lemma      - selects the strategy:
//                      0: the normalized form of the word is returned
//                      1: the normalized lemma is returned, unless the lemma (raw or
//                         normalized) is in flat_lemmas, in which case the
//                         normalized form is returned
//                      any other value (2): an empty lemma yields "~~" + normalized
//                         form, the lemma "_" yields "~_~" + normalized form, a
//                         lemma in flat_lemmas yields "~" + normalized lemma + "~" +
//                         normalized form, and every other lemma yields the
//                         normalized lemma
//   combined_lemma - buffer that receives the result
//   flat_lemmas    - lemmas (raw or normalized) that get the special handling above
const string& trainer_morphodita_parsito::combine_lemma(const word& w, int use_lemma, string& combined_lemma, const unordered_set<string>& flat_lemmas) {
  switch (use_lemma) {
    case 0:
      return model_normalize_form(w.form, combined_lemma);

    case 1:
      model_normalize_lemma(w.lemma, combined_lemma);
      if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma))
        return model_normalize_form(w.form, combined_lemma);
      return combined_lemma;

    default: /*2*/
      if (w.lemma.empty()) {
        model_normalize_form(w.form, combined_lemma);
        return combined_lemma.insert(0, "~~");
      }
      if (w.lemma == "_") {
        model_normalize_form(w.form, combined_lemma);
        return combined_lemma.insert(0, "~_~");
      }

      model_normalize_lemma(w.lemma, combined_lemma);
      if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma)) {
        // Keep the flat lemma, but append the normalized form to it.
        string normalized_form;
        model_normalize_form(w.form, normalized_form);
        return combined_lemma.insert(0, "~").append("~").append(normalized_form);
      }
      return combined_lemma;
  }
}
22106
|
|
|
|
|
|
|
|
22107
|
|
|
|
|
|
|
// Generic options handling |
22108
|
|
|
|
|
|
|
|
22109
|
0
|
|
|
|
|
|
// Looks up a string option and returns a reference to its value.
//
// For 0 <= model < 9 the model-specific key name + "_" + digit ('1' + model) is
// tried first; then the plain name is tried. If neither key is present, a
// reference to the shared empty_string is returned.
const string& trainer_morphodita_parsito::option_str(const named_values::map& options, const string& name, int model) {
  string model_specific(name);
  if (model >= 0 && model < 9) {
    model_specific += '_';
    model_specific += char('1' + model);
  }

  auto entry = options.find(model_specific);
  if (entry != options.end()) return entry->second;

  entry = options.find(name);
  if (entry != options.end()) return entry->second;

  return empty_string;
}
22115
|
|
|
|
|
|
|
|
22116
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::option_int(const named_values::map& options, const string& name, int& value, string& error, int model) { |
22117
|
|
|
|
|
|
|
string indexed_name(name); |
22118
|
0
|
0
|
|
|
|
|
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22119
|
|
|
|
|
|
|
|
22120
|
0
|
0
|
|
|
|
|
if (options.count(indexed_name)) |
22121
|
0
|
0
|
|
|
|
|
return parse_int(options.at(indexed_name), name.c_str(), value, error); |
22122
|
0
|
0
|
|
|
|
|
if (options.count(name)) |
22123
|
0
|
0
|
|
|
|
|
return parse_int(options.at(name), name.c_str(), value, error); |
22124
|
|
|
|
|
|
|
return true; |
22125
|
|
|
|
|
|
|
} |
22126
|
|
|
|
|
|
|
|
22127
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::option_bool(const named_values::map& options, const string& name, bool& value, string& error, int model) { |
22128
|
|
|
|
|
|
|
string indexed_name(name); |
22129
|
0
|
0
|
|
|
|
|
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22130
|
|
|
|
|
|
|
|
22131
|
0
|
0
|
|
|
|
|
if (options.count(indexed_name) || options.count(name)) { |
22132
|
|
|
|
|
|
|
int int_value; |
22133
|
0
|
0
|
|
|
|
|
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22134
|
0
|
|
|
|
|
|
return false; |
22135
|
0
|
|
|
|
|
|
value = int_value != 0; |
22136
|
|
|
|
|
|
|
} |
22137
|
|
|
|
|
|
|
return true; |
22138
|
|
|
|
|
|
|
} |
22139
|
|
|
|
|
|
|
|
22140
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::option_double(const named_values::map& options, const string& name, double& value, string& error, int model) { |
22141
|
|
|
|
|
|
|
string indexed_name(name); |
22142
|
0
|
0
|
|
|
|
|
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22143
|
|
|
|
|
|
|
|
22144
|
0
|
0
|
|
|
|
|
if (options.count(indexed_name)) |
22145
|
0
|
0
|
|
|
|
|
return parse_double(options.at(indexed_name), name.c_str(), value, error); |
22146
|
0
|
0
|
|
|
|
|
if (options.count(name)) |
22147
|
0
|
0
|
|
|
|
|
return parse_double(options.at(name), name.c_str(), value, error); |
22148
|
|
|
|
|
|
|
return true; |
22149
|
|
|
|
|
|
|
} |
22150
|
|
|
|
|
|
|
|
22151
|
|
|
|
|
|
|
// Various string data |
22152
|
|
|
|
|
|
|
|
22153
|
2
|
|
|
|
|
|
// Shared empty string; option_str returns a reference to it for missing options.
const string trainer_morphodita_parsito::empty_string{};

// Candidate delimiter characters for combined tags, tried in this order by
// can_combine_tag and combine_tag.
const string trainer_morphodita_parsito::tag_separators("~!@#$%^&*()/");
22156
|
|
|
|
|
|
|
|
22157
|
2
|
|
|
|
|
|
// Feature templates selected by the tagger option templates=tagger (and by default
// for every tagger model except the one with index 1). One template per line; the
// whole text is handed to the tagger trainer as its feature template input.
//
// NOTE(review): "Tag 0,Form 1" appears twice in the list; both occurrences are
// kept so that the value of the string stays exactly the same.
const string trainer_morphodita_parsito::tagger_features_tagger = R"(Tag 0
Tag 0,Tag -1
Tag 0,TagUPos -1
Tag 0,Tag -1,Tag -2
Tag 0,TagUPos -1,TagUPos -2
Tag 0,Tag -2
Tag 0,Form 0
Tag 0,Form 0,Form -1
Tag 0,Form -1
Tag 0,Form -2
Tag 0,Form -1,Form -2
Tag 0,Form 1
Tag 0,Form 1,Form 2
Tag 0,PreviousVerbTag 0
Tag 0,PreviousVerbForm 0
Tag 0,FollowingVerbTag 0
Tag 0,FollowingVerbForm 0
Tag 0,Lemma -1
Tag 0,Form 1
Lemma 0,Tag -1
Tag 0,Prefix1 0
Tag 0,Prefix2 0
Tag 0,Prefix3 0
Tag 0,Prefix4 0
Tag 0,Prefix5 0
Tag 0,Prefix6 0
Tag 0,Prefix7 0
Tag 0,Prefix8 0
Tag 0,Prefix9 0
Tag 0,Suffix1 0
Tag 0,Suffix2 0
Tag 0,Suffix3 0
Tag 0,Suffix4 0
Tag 0,Suffix5 0
Tag 0,Suffix6 0
Tag 0,Suffix7 0
Tag 0,Suffix8 0
Tag 0,Suffix9 0
TagUPos 0
TagUPos 0,TagUPos -1
TagUPos 0,TagUPos -1,TagUPos -2
TagCase 0,TagCase -1
TagCase 0,TagCase -1,TagCase -2
TagGender 0,TagGender -1
TagGender 0,TagGender -1,TagGender -2
TagUPos 0,Prefix1 0
TagUPos 0,Prefix2 0
TagUPos 0,Prefix3 0
TagUPos 0,Prefix4 0
TagUPos 0,Prefix5 0
TagUPos 0,Prefix6 0
TagUPos 0,Prefix7 0
TagUPos 0,Prefix8 0
TagUPos 0,Prefix9 0
TagUPos 0,Suffix1 0
TagUPos 0,Suffix2 0
TagUPos 0,Suffix3 0
TagUPos 0,Suffix4 0
TagUPos 0,Suffix5 0
TagUPos 0,Suffix6 0
TagUPos 0,Suffix7 0
TagUPos 0,Suffix8 0
TagUPos 0,Suffix9 0
Tag 0,Num 0
Tag 0,Cap 0
Tag 0,Dash 0
TagNegative 0,Prefix1 0
TagNegative 0,Prefix2 0
TagNegative 0,Prefix3 0
TagCase 0,Suffix1 0
TagCase 0,Suffix2 0
TagCase 0,Suffix3 0
TagCase 0,Suffix4 0
TagCase 0,Suffix5 0
)";
22232
|
|
|
|
|
|
|
|
22233
|
2
|
|
|
|
|
|
// Feature templates selected by the tagger option templates=lemmatizer (and by
// default for the tagger model with index 1). One template per line; the whole
// text is handed to the tagger trainer as its feature template input.
const string trainer_morphodita_parsito::tagger_features_lemmatizer = R"(Tag 0
Tag 0,Tag -1
Tag 0,Tag -1,Tag -2
Tag 0,Tag -2
Tag 0,Form 0
Tag 0,Form 0,Form -1
Tag 0,Form -1
Tag 0,Form -2
Tag 0,PreviousVerbTag 0
Tag 0,PreviousVerbForm 0
Tag 0,FollowingVerbTag 0
Tag 0,FollowingVerbForm 0
Tag 0,Lemma -1
Tag 0,Form 1
Lemma 0
Lemma 0,Tag -1
Lemma 0,Tag -1,Tag -2
Lemma 0,Tag -2
Lemma 0,Form -1
Lemma 0,Form -1,Form -2
Lemma 0,Form -2
Lemma 0,PreviousVerbTag 0
Lemma 0,PreviousVerbForm 0
Lemma 0,FollowingVerbTag 0
Lemma 0,FollowingVerbForm 0
Lemma 0,Form 1
Tag 0,Prefix1 0
Tag 0,Prefix2 0
Tag 0,Prefix3 0
Tag 0,Prefix4 0
Tag 0,Prefix5 0
Tag 0,Suffix1 0
Tag 0,Suffix2 0
Tag 0,Suffix3 0
Tag 0,Suffix4 0
Tag 0,Suffix5 0
Tag 0,Num 0
Tag 0,Cap 0
Tag 0,Dash 0
)";
22273
|
|
|
|
|
|
|
|
22274
|
2
|
|
|
|
|
|
// Node descriptions for the parser, one per line. Each line names a position on
// the stack or in the buffer, optionally followed by one or two child selectors.
const string trainer_morphodita_parsito::parser_nodes = R"(stack 0
stack 1
stack 2
buffer 0
buffer 1
buffer 2
stack 0,child 0
stack 0,child 1
stack 0,child -2
stack 0,child -1
stack 1,child 0
stack 1,child 1
stack 1,child -2
stack 1,child -1
stack 0,child 0,child 0
stack 0,child -1,child -1
stack 1,child 0,child 0
stack 1,child -1,child -1
)";
22293
|
|
|
|
|
|
|
|
22294
|
|
|
|
|
|
|
///////// |
22295
|
|
|
|
|
|
|
// File: trainer/training_failure.cpp |
22296
|
|
|
|
|
|
|
///////// |
22297
|
|
|
|
|
|
|
|
22298
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
22299
|
|
|
|
|
|
|
// |
22300
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
22301
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
22302
|
|
|
|
|
|
|
// |
22303
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
22304
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
22305
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
22306
|
|
|
|
|
|
|
|
22307
|
0
|
0
|
|
|
|
|
training_error::training_error() : runtime_error(message_collector.str()) { |
22308
|
0
|
|
|
|
|
|
message_collector.str(string()); |
22309
|
0
|
|
|
|
|
|
} |
22310
|
|
|
|
|
|
|
|
22311
|
2
|
|
|
|
|
|
ostringstream training_error::message_collector; |
22312
|
|
|
|
|
|
|
|
22313
|
|
|
|
|
|
|
///////// |
22314
|
|
|
|
|
|
|
// File: unilib/unicode.cpp |
22315
|
|
|
|
|
|
|
///////// |
22316
|
|
|
|
|
|
|
|
22317
|
|
|
|
|
|
|
// This file is part of UniLib <http://github.com/ufal/unilib/>. |
22318
|
|
|
|
|
|
|
// |
22319
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
22320
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
22321
|
|
|
|
|
|
|
// |
22322
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
22323
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
22324
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
22325
|
|
|
|
|
|
|
// |
22326
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
22327
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
22328
|
|
|
|
|
|
|
|
22329
|
|
|
|
|
|
|
namespace unilib { |
22330
|
|
|
|
|
|
|
|
22331
|
|
|
|
|
|
|
const char32_t unicode::CHARS; |
22332
|
|
|
|
|
|
|
|
22333
|
|
|
|
|
|
|
const int32_t unicode::DEFAULT_CAT; |
22334
|
|
|
|
|
|
|
|
22335
|
|
|
|
|
|
|
const uint8_t unicode::category_index[unicode::CHARS >> 8] = { |
22336
|
|
|
|
|
|
|
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,17,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,33,41,42,43,44,45,46,47,48,39,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,49,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,50,17,17,17,51,17,52,53,54,55,56,57,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,58,59,59,59,59,59,59,59,59,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,17,61,62,17,63,64,65,66,67,68,69,70,71,17,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,17,17,17,97,98,99,100,100,100,100,100,100,100,100,100,101,17,17,17,17,102,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,17,17,103,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,17,17,104,105,100,100,106,107,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,108,17,17,17,17,109,110,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,111,17,112,113,100,100,100,100,100,100,100,100,100,114,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,115,116,117,118,119,120,121,122,123,39,39,124,100,100,100,100,125,126,127,128,100,129,100,100,130,131,132,100,100,133,134,135,100,136,137,138,139,39,39,140,141,142,39,143,144,100,100,100,100,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17, |
22337
|
|
|
|
|
|
|
17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,145,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,146,147,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,148,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,149,100,100,100,100,100,100,100,100,100,100,100,100,17,17,150,100,100,100,100,100,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,151,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,152,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, 
|
22338
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
22339
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
22340
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
22341
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
22342
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,153,154,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
22343
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,155,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60, |
22344
|
|
|
|
|
|
|
60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,155 |
22345
|
|
|
|
|
|
|
}; |
22346
|
|
|
|
|
|
|
|
22347
|
|
|
|
|
|
|
const uint8_t unicode::category_block[][256] = { |
22348
|
|
|
|
|
|
|
{_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Zs,_Po,_Po,_Po,_Sc,_Po,_Po,_Po,_Ps,_Pe,_Po,_Sm,_Po,_Pd,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Sm,_Sm,_Sm,_Po,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ps,_Po,_Pe,_Sk,_Pc,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ps,_Sm,_Pe,_Sm,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Zs,_Po,_Sc,_Sc,_Sc,_Sc,_So,_Po,_Sk,_So,_Lo,_Pi,_Sm,_Cf,_So,_Sk,_So,_Sm,_No,_No,_Sk,_Ll,_Po,_Po,_Sk,_No,_Lo,_Pf,_No,_No,_No,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll}, |
22349
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lo,_Lu,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lu,_Lt,_Ll,_Lu,_Lt,_Ll,_Lu,_Lt,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Lt,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll}, |
22350
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Sk,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk}, |
22351
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lu,_Ll,_Lu,_Ll,_Lm,_Sk,_Lu,_Ll,_Cn,_Cn,_Lm,_Ll,_Ll,_Ll,_Po,_Lu,_Cn,_Cn,_Cn,_Cn,_Sk,_Sk,_Lu,_Po,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Sm,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu}, |
22352
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Me,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll}, |
22353
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Po,_Pd,_Cn,_Cn,_So,_So,_Sc,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Pd,_Mn,_Po,_Mn,_Mn,_Po,_Mn,_Mn,_Po,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22354
|
|
|
|
|
|
|
{_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Sm,_Sm,_Sm,_Po,_Po,_Sc,_Po,_Po,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Cf,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cf,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Mn,_Mn,_So,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_So,_So,_Lo}, |
22355
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cf,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_So,_Po,_Po,_Po,_Lm,_Cn,_Cn,_Mn,_Sc,_Sc}, |
22356
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Cn,_Cn,_Po,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sk,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cf,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
22357
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Sc,_Sc,_No,_No,_No,_No,_No,_No,_So,_Sc,_Lo,_Po,_Mn,_Cn}, |
22358
|
|
|
|
|
|
|
{_Cn,_Mn,_Mn,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Mn,_Cn,_Mc,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mn,_Mn,_Lo,_Lo,_Lo,_Mn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mc,_Cn,_Mc,_Mc,_Mn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Sc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
22359
|
|
|
|
|
|
|
{_Cn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_So,_Lo,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Mc,_Mn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_So,_So,_So,_So,_So,_So,_Sc,_So,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22360
|
|
|
|
|
|
|
{_Mn,_Mc,_Mc,_Mc,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_No,_No,_No,_No,_No,_No,_No,_So,_Lo,_Mn,_Mc,_Mc,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mc,_Mc,_Cn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Lo,_Lo,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22361
|
|
|
|
|
|
|
{_Mn,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Mc,_Mn,_Lo,_So,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Mc,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Cn,_Mn,_Cn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Mc,_Mc,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22362
|
|
|
|
|
|
|
{_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Sc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lm,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22363
|
|
|
|
|
|
|
{_Lo,_So,_So,_So,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_Po,_So,_So,_So,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_Mn,_So,_Mn,_So,_Mn,_Ps,_Pe,_Ps,_Pe,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22364
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Lo,_Mc,_Mc,_Mc,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Lo,_Mc,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mc,_Mc,_Mc,_Mn,_So,_So,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Po,_Lm,_Ll,_Ll,_Ll}, |
22365
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22366
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22367
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn}, |
22368
|
|
|
|
|
|
|
{_Pd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22369
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Zs,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Ps,_Pe,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Nl,_Nl,_Nl,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22370
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Sc,_Lo,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22371
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Po,_Po,_Po,_Po,_Mn,_Mn,_Mn,_Cf,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22372
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22373
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mc,_Mn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22374
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Po,_Po,_Cn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po}, |
22375
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Po,_Po,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Po,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mc,_Mn,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22376
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
22377
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll}, |
22378
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Lu,_Cn,_Lu,_Cn,_Lu,_Cn,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Ll,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Cn,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Sk,_Sk,_Sk,_Cn,_Cn,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Sk,_Cn}, |
22379
|
|
|
|
|
|
|
{_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Cf,_Cf,_Cf,_Cf,_Cf,_Pd,_Pd,_Pd,_Pd,_Pd,_Pd,_Po,_Po,_Pi,_Pf,_Ps,_Pi,_Pi,_Pf,_Ps,_Pi,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Zl,_Zp,_Cf,_Cf,_Cf,_Cf,_Cf,_Zs,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pi,_Pf,_Po,_Po,_Po,_Po,_Pc,_Pc,_Po,_Po,_Po,_Sm,_Ps,_Pe,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Sm,_Po,_Pc,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Zs,_Cf,_Cf,_Cf,_Cf,_Cf,_Cn,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_No,_Lm,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_Sm,_Sm,_Sm,_Ps,_Pe,_Lm,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Sm,_Sm,_Sm,_Ps,_Pe,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Me,_Me,_Me,_Mn,_Me,_Me,_Me,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22380
|
|
|
|
|
|
|
{_So,_So,_Lu,_So,_So,_So,_So,_Lu,_So,_So,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Ll,_So,_Lu,_So,_So,_Sm,_Lu,_Lu,_Lu,_Lu,_Lu,_So,_So,_So,_So,_So,_So,_Lu,_So,_Lu,_So,_Lu,_So,_Lu,_Lu,_Lu,_Lu,_So,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lo,_Lo,_Lo,_Lo,_Ll,_So,_So,_Ll,_Ll,_Lu,_Lu,_Sm,_Sm,_Sm,_Sm,_Sm,_Lu,_Ll,_Ll,_Ll,_Ll,_So,_Sm,_So,_So,_Ll,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Lu,_Ll,_Nl,_Nl,_Nl,_Nl,_No,_So,_So,_Cn,_Cn,_Cn,_Cn,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_So,_So,_Sm,_So,_So,_Sm,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_Sm,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
22381
|
|
|
|
|
|
|
{_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
22382
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_Ps,_Pe,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22383
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No}, |
22384
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
22385
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22386
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
22387
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22388
|
|
|
|
|
|
|
{_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Sm,_Sm}, |
22389
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22390
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_So,_So,_So,_So,_So,_So,_Lu,_Ll,_Lu,_Ll,_Mn,_Mn,_Mn,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_No,_Po,_Po}, |
22391
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
22392
|
|
|
|
|
|
|
{_Po,_Po,_Pi,_Pf,_Pi,_Pf,_Po,_Po,_Po,_Pi,_Pf,_Po,_Pi,_Pf,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Po,_Po,_Pd,_Po,_Pi,_Pf,_Po,_Po,_Pi,_Pf,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Pd,_Po,_Po,_Po,_Po,_Pd,_Po,_Ps,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_So,_Po,_Po,_Po,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Pd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22393
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn}, |
22394
|
|
|
|
|
|
|
{_Zs,_Po,_Po,_Po,_So,_Lm,_Lo,_Nl,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_So,_So,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Pd,_Ps,_Pe,_Pe,_So,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Pd,_Lm,_Lm,_Lm,_Lm,_Lm,_So,_So,_Nl,_Nl,_Nl,_Lm,_Lo,_Po,_So,_So,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Sk,_Sk,_Lm,_Lm,_Lo,_Pd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Lm,_Lm,_Lm,_Lo}, |
22395
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_So,_So,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22396
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22397
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22398
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22399
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Po,_Po}, |
22400
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lo,_Mn,_Me,_Me,_Me,_Po,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Lm,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Lm,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22401
|
|
|
|
|
|
|
{_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Sk,_Sk,_Lu,_Ll,_Lu,_Ll,_Lo,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Ll,_Cn,_Ll,_Cn,_Ll,_Lu,_Ll,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lu,_Ll,_Lo,_Lm,_Lm,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22402
|
|
|
|
|
|
|
{_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mc,_So,_So,_So,_So,_Mn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_So,_So,_Sc,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Lo,_Po,_Lo,_Lo,_Mn}, |
22403
|
|
|
|
|
|
|
{_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Lm,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn}, |
22404
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_Lo,_Mc,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Mn,_Mn,_Mn,_Lo,_Lo,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Lo,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lm,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mc,_Mc,_Po,_Po,_Lo,_Lm,_Lm,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22405
|
|
|
|
|
|
|
{_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sk,_Lm,_Lm,_Lm,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Sk,_Sk,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mc,_Mc,_Mn,_Mc,_Mc,_Po,_Mc,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22406
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn}, |
22407
|
|
|
|
|
|
|
{_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs}, |
22408
|
|
|
|
|
|
|
{_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co}, |
22409
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22410
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22411
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Pe,_Ps,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sc,_So,_So,_So}, |
22412
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Ps,_Pe,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Pd,_Pd,_Pc,_Pc,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Ps,_Pe,_Po,_Po,_Po,_Po,_Pc,_Pc,_Pc,_Po,_Po,_Po,_Cn,_Po,_Po,_Po,_Po,_Pd,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Po,_Sm,_Pd,_Sm,_Sm,_Sm,_Cn,_Po,_Sc,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cf}, |
22413
|
|
|
|
|
|
|
{_Cn,_Po,_Po,_Po,_Sc,_Po,_Po,_Po,_Ps,_Pe,_Po,_Sm,_Po,_Pd,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Sm,_Sm,_Sm,_Po,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ps,_Po,_Pe,_Sk,_Pc,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ps,_Sm,_Pe,_Sm,_Ps,_Pe,_Po,_Ps,_Pe,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Sc,_Sc,_Sm,_Sk,_So,_Sc,_Sc,_Cn,_So,_Sm,_Sm,_Sm,_Sm,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cf,_Cf,_So,_So,_Cn,_Cn}, |
22414
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22415
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Cn,_Cn}, |
22416
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn}, |
22417
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Nl,_Nl,_Nl,_Nl,_Nl,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22418
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn}, |
22419
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22420
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22421
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Po,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No}, |
22422
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_No,_No,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No}, |
22423
|
|
|
|
|
|
|
{_Lo,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22424
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22425
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No}, |
22426
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22427
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Pd,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn}, |
22428
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_No,_No,_No,_No,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22429
|
|
|
|
|
|
|
{_Mc,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mn,_Lo,_Lo,_Mn,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Po,_Po,_Cf,_Po,_Po,_Po,_Po,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22430
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Lo,_Mc,_Mc,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Po,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Mn,_Mn,_Mn,_Mn,_Po,_Mc,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Po,_Lo,_Po,_Po,_Po,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22431
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Mn,_Lo,_Lo,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22432
|
|
|
|
|
|
|
{_Mn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Lo,_Mc,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mc,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22433
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mc,_Mn,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Cn,_Po,_Mn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Lo,_Lo,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22434
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22435
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Lo,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22436
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_Po,_Po,_Po,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22437
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo}, |
22438
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Cn,_Cn,_Mn,_Mn,_Mc,_Mn,_Lo,_Mc,_Lo,_Mc,_Mn,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Lo,_Po,_Lo,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22439
|
|
|
|
|
|
|
{_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Lo,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Lo,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22440
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22441
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Lo,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22442
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Cn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22443
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mc,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22444
|
|
|
|
|
|
|
{_Mn,_Mn,_Lo,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_Sc,_Sc,_Sc,_Sc,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po}, |
22445
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22446
|
|
|
|
|
|
|
{_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Cn,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22447
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22448
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22449
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22450
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22451
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22452
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22453
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_Lm,_Lm,_Lm,_Lm,_Po,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_No,_No,_No,_No,_No,_No,_No,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22454
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22455
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Po,_Lm,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22456
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22457
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22458
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22459
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Cn}, |
22460
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22461
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn}, |
22462
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_So,_Mn,_Mn,_Po,_Cf,_Cf,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22463
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22464
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22465
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mc,_Mc,_Mn,_Mn,_Mn,_So,_So,_So,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22466
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22467
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22468
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Cn,_Lu,_Lu,_Cn,_Cn,_Lu,_Cn,_Cn,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll}, |
22469
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll}, |
22470
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll}, |
22471
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd}, |
22472
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22473
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22474
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22475
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Lo,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22476
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Sc}, |
22477
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Mn,_Mn,_Mn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22478
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn}, |
22479
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22480
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22481
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_No,_No,_No,_Sc,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22482
|
|
|
|
|
|
|
{_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22483
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Sm,_Sm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22484
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22485
|
|
|
|
|
|
|
{_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
22486
|
|
|
|
|
|
|
{_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22487
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sk,_Sk,_Sk,_Sk,_Sk}, |
22488
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn}, |
22489
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22490
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22491
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22492
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22493
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22494
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22495
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22496
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22497
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22498
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22499
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
22500
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22501
|
|
|
|
|
|
|
{_Cn,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22502
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
22503
|
|
|
|
|
|
|
{_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Cn,_Cn} |
22504
|
|
|
|
|
|
|
}; |
22505
|
|
|
|
|
|
|
|
22506
|
|
|
|
|
|
|
const uint8_t unicode::othercase_index[unicode::CHARS >> 8] = { |
22507
|
|
|
|
|
|
|
0,1,2,3,4,5,6,6,6,6,6,6,6,6,6,6,7,6,6,8,6,6,6,6,6,6,6,6,9,10,11,12,6,13,6,6,14,6,6,6,6,6,6,6,15,16,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,17,18,6,6,6,19,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,20,6,6,6,6,21,22,6,6,6,6,6,6,23,6,6,6,6,6,6,6,6,6,6,6,24,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,25,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,26,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, |
22508
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 |
22509
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 |
22510
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 |
22511
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 |
22512
|
|
|
|
|
|
|
}; |
22513
|
|
|
|
|
|
|
|
22514
|
|
|
|
|
|
|
const char32_t unicode::othercase_block[][256] = { |
22515
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24833,25089,25345,25601,25857,26113,26369,26625,26881,27137,27393,27649,27905,28161,28417,28673,28929,29185,29441,29697,29953,30209,30465,30721,30977,31233,0,0,0,0,0,0,16642,16898,17154,17410,17666,17922,18178,18434,18690,18946,19202,19458,19714,19970,20226,20482,20738,20994,21250,21506,21762,22018,22274,22530,22786,23042,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,236546,0,0,0,0,0,0,0,0,0,0,57345,57601,57857,58113,58369,58625,58881,59137,59393,59649,59905,60161,60417,60673,60929,61185,61441,61697,61953,62209,62465,62721,62977,0,63489,63745,64001,64257,64513,64769,65025,0,49154,49410,49666,49922,50178,50434,50690,50946,51202,51458,51714,51970,52226,52482,52738,52994,53250,53506,53762,54018,54274,54530,54786,0,55298,55554,55810,56066,56322,56578,56834,96258}, |
22516
|
|
|
|
|
|
|
{65793,65538,66305,66050,66817,66562,67329,67074,67841,67586,68353,68098,68865,68610,69377,69122,69889,69634,70401,70146,70913,70658,71425,71170,71937,71682,72449,72194,72961,72706,73473,73218,73985,73730,74497,74242,75009,74754,75521,75266,76033,75778,76545,76290,77057,76802,77569,77314,26881,18690,78593,78338,79105,78850,79617,79362,0,80385,80130,80897,80642,81409,81154,81921,81666,82433,82178,82945,82690,83457,83202,83969,83714,0,84737,84482,85249,84994,85761,85506,86273,86018,86785,86530,87297,87042,87809,87554,88321,88066,88833,88578,89345,89090,89857,89602,90369,90114,90881,90626,91393,91138,91905,91650,92417,92162,92929,92674,93441,93186,93953,93698,94465,94210,94977,94722,95489,95234,96001,95746,65281,96769,96514,97281,97026,97793,97538,21250,148226,152321,99073,98818,99585,99330,152577,100353,100098,153089,153345,101377,101122,0,122113,153857,154369,102913,102658,155649,156417,128514,157953,157697,104705,104450,146690,0,159489,160257,139266,161025,106753,106498,107265,107010,107777,107522,163841,108545,108290,164609,0,0,109825,109570,165889,110593,110338,166401,166657,111617,111362,112129,111874,168449,112897,112642,0,0,113921,113666,0,128770,0,0,0,0,115974,116228,115717,116742,116996,116485,117510,117764,117253,118273,118018,118785,118530,119297,119042,119809,119554,120321,120066,120833,120578,121345,121090,121857,121602,101890,122625,122370,123137,122882,123649,123394,124161,123906,124673,124418,125185,124930,125697,125442,126209,125954,126721,126466,0,127494,127748,127237,128257,128002,103681,114433,129281,129026,129793,129538,130305,130050,130817,130562}, |
22517
|
|
|
|
|
|
|
{131329,131074,131841,131586,132353,132098,132865,132610,133377,133122,133889,133634,134401,134146,134913,134658,135425,135170,135937,135682,136449,136194,136961,136706,137473,137218,137985,137730,138497,138242,139009,138754,105985,0,140033,139778,140545,140290,141057,140802,141569,141314,142081,141826,142593,142338,143105,142850,143617,143362,144129,143874,0,0,0,0,0,0,2909441,146433,146178,104961,2909697,2915842,2916098,147969,147714,98305,166145,166913,149249,148994,149761,149506,150273,150018,150785,150530,151297,151042,2912002,2911490,2912258,98562,99842,0,100610,100866,0,102146,0,102402,10988290,0,0,0,103170,10988546,0,103426,0,10980610,10988034,0,104194,103938,10989058,2908674,10988802,0,0,105474,0,2911746,105730,0,0,106242,0,0,0,0,0,0,0,2909186,0,0,108034,0,10994946,108802,0,0,0,10989826,110082,148482,110850,111106,148738,0,0,0,0,0,112386,0,0,0,0,0,0,0,0,0,0,10990082,10989570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22518
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235778,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,225537,225282,226049,225794,0,0,227073,226818,0,0,0,261378,261634,261890,0,258817,0,0,0,0,0,0,240641,0,240897,241153,241409,0,248833,0,249089,249345,0,241921,242177,242433,242689,242945,243201,243457,243713,243969,244225,244481,244737,244993,245249,245505,245761,246017,0,246529,246785,247041,247297,247553,247809,248065,248321,248577,230914,231426,231682,231938,0,233730,233986,234242,234498,234754,235010,235266,235522,235778,236034,236290,236546,236802,237058,237314,237570,237826,238338,238338,238594,238850,239106,239362,239618,239874,240130,240386,232450,232962,233218,251649,233986,235522,0,0,0,239106,237570,249602,252161,251906,252673,252418,253185,252930,253697,253442,254209,253954,254721,254466,255233,254978,255745,255490,256257,256002,256769,256514,257281,257026,257793,257538,236034,237826,260354,229122,243713,234754,0,260097,259842,258561,260865,260610,0,228097,228353,228609}, |
22519
|
|
|
|
|
|
|
{282625,282881,283137,283393,283649,283905,284161,284417,284673,284929,285185,285441,285697,285953,286209,286465,274433,274689,274945,275201,275457,275713,275969,276225,276481,276737,276993,277249,277505,277761,278017,278273,278529,278785,279041,279297,279553,279809,280065,280321,280577,280833,281089,281345,281601,281857,282113,282369,266242,266498,266754,267010,267266,267522,267778,268034,268290,268546,268802,269058,269314,269570,269826,270082,270338,270594,270850,271106,271362,271618,271874,272130,272386,272642,272898,273154,273410,273666,273922,274178,262146,262402,262658,262914,263170,263426,263682,263938,264194,264450,264706,264962,265218,265474,265730,265986,286977,286722,287489,287234,288001,287746,288513,288258,289025,288770,289537,289282,290049,289794,290561,290306,291073,290818,291585,291330,292097,291842,292609,292354,293121,292866,293633,293378,294145,293890,294657,294402,295169,294914,0,0,0,0,0,0,0,0,297729,297474,298241,297986,298753,298498,299265,299010,299777,299522,300289,300034,300801,300546,301313,301058,301825,301570,302337,302082,302849,302594,303361,303106,303873,303618,304385,304130,304897,304642,305409,305154,305921,305666,306433,306178,306945,306690,307457,307202,307969,307714,308481,308226,308993,308738,309505,309250,310017,309762,310529,310274,311041,310786,315137,311809,311554,312321,312066,312833,312578,313345,313090,313857,313602,314369,314114,314881,314626,311298,315649,315394,316161,315906,316673,316418,317185,316930,317697,317442,318209,317954,318721,318466,319233,318978,319745,319490,320257,320002,320769,320514,321281,321026,321793,321538,322305,322050,322817,322562,323329,323074,323841,323586,324353,324098,324865,324610,325377,325122,325889,325634,326401,326146,326913,326658,327425,327170}, |
22520
|
|
|
|
|
|
|
{327937,327682,328449,328194,328961,328706,329473,329218,329985,329730,330497,330242,331009,330754,331521,331266,332033,331778,332545,332290,333057,332802,333569,333314,334081,333826,334593,334338,335105,334850,335617,335362,336129,335874,336641,336386,337153,336898,337665,337410,338177,337922,338689,338434,339201,338946,339713,339458,0,352513,352769,353025,353281,353537,353793,354049,354305,354561,354817,355073,355329,355585,355841,356097,356353,356609,356865,357121,357377,357633,357889,358145,358401,358657,358913,359169,359425,359681,359937,360193,360449,360705,360961,361217,361473,361729,361985,0,0,0,0,0,0,0,0,0,0,340226,340482,340738,340994,341250,341506,341762,342018,342274,342530,342786,343042,343298,343554,343810,344066,344322,344578,344834,345090,345346,345602,345858,346114,346370,346626,346882,347138,347394,347650,347906,348162,348418,348674,348930,349186,349442,349698,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22521
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22522
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2949121,2949377,2949633,2949889,2950145,2950401,2950657,2950913,2951169,2951425,2951681,2951937,2952193,2952449,2952705,2952961,2953217,2953473,2953729,2953985,2954241,2954497,2954753,2955009,2955265,2955521,2955777,2956033,2956289,2956545,2956801,2957057,2957313,2957569,2957825,2958081,2958337,2958593,0,2959105,0,0,0,0,0,2960641,0,0,1871875,1872131,1872387,1872643,1872899,1873155,1873411,1873667,1873923,1874179,1874435,1874691,1874947,1875203,1875459,1875715,1875971,1876227,1876483,1876739,1876995,1877251,1877507,1877763,1878019,1878275,1878531,1878787,1879043,1879299,1879555,1879811,1880067,1880323,1880579,1880835,1881091,1881347,1881603,1881859,1882115,1882371,1882627,0,0,1883395,1883651,1883907}, |
22523
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11235329,11235585,11235841,11236097,11236353,11236609,11236865,11237121,11237377,11237633,11237889,11238145,11238401,11238657,11238913,11239169,11239425,11239681,11239937,11240193,11240449,11240705,11240961,11241217,11241473,11241729,11241985,11242241,11242497,11242753,11243009,11243265,11243521,11243777,11244033,11244289,11244545,11244801,11245057,11245313,11245569,11245825,11246081,11246337,11246593,11246849,11247105,11247361,11247617,11247873,11248129,11248385,11248641,11248897,11249153,11249409,11249665,11249921,11250177,11250433,11250689,11250945,11251201,11251457,11251713,11251969,11252225,11252481,11252737,11252993,11253249,11253505,11253761,11254017,11254273,11254529,11254785,11255041,11255297,11255553,1308673,1308929,1309185,1309441,1309697,1309953,0,0,1306626,1306882,1307138,1307394,1307650,1307906,0,0}, |
22524
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,266754,267266,269826,270594,270850,270850,272898,287234,10897922,0,0,0,0,0,0,0,1101825,1102081,1102337,1102593,1102849,1103105,1103361,1103617,1103873,1104129,1104385,1104641,1104897,1105153,1105409,1105665,1105921,1106177,1106433,1106689,1106945,1107201,1107457,1107713,1107969,1108225,1108481,1108737,1108993,1109249,1109505,1109761,1110017,1110273,1110529,1110785,1111041,1111297,1111553,1111809,1112065,1112321,1112577,0,0,1113345,1113601,1113857,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22525
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10976514,0,0,0,2908930,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10995202,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22526
|
|
|
|
|
|
|
{1966337,1966082,1966849,1966594,1967361,1967106,1967873,1967618,1968385,1968130,1968897,1968642,1969409,1969154,1969921,1969666,1970433,1970178,1970945,1970690,1971457,1971202,1971969,1971714,1972481,1972226,1972993,1972738,1973505,1973250,1974017,1973762,1974529,1974274,1975041,1974786,1975553,1975298,1976065,1975810,1976577,1976322,1977089,1976834,1977601,1977346,1978113,1977858,1978625,1978370,1979137,1978882,1979649,1979394,1980161,1979906,1980673,1980418,1981185,1980930,1981697,1981442,1982209,1981954,1982721,1982466,1983233,1982978,1983745,1983490,1984257,1984002,1984769,1984514,1985281,1985026,1985793,1985538,1986305,1986050,1986817,1986562,1987329,1987074,1987841,1987586,1988353,1988098,1988865,1988610,1989377,1989122,1989889,1989634,1990401,1990146,1990913,1990658,1991425,1991170,1991937,1991682,1992449,1992194,1992961,1992706,1993473,1993218,1993985,1993730,1994497,1994242,1995009,1994754,1995521,1995266,1996033,1995778,1996545,1996290,1997057,1996802,1997569,1997314,1998081,1997826,1998593,1998338,1999105,1998850,1999617,1999362,2000129,1999874,2000641,2000386,2001153,2000898,2001665,2001410,2002177,2001922,2002689,2002434,2003201,2002946,2003713,2003458,2004225,2003970,0,0,0,0,0,1990658,0,0,57089,0,2007297,2007042,2007809,2007554,2008321,2008066,2008833,2008578,2009345,2009090,2009857,2009602,2010369,2010114,2010881,2010626,2011393,2011138,2011905,2011650,2012417,2012162,2012929,2012674,2013441,2013186,2013953,2013698,2014465,2014210,2014977,2014722,2015489,2015234,2016001,2015746,2016513,2016258,2017025,2016770,2017537,2017282,2018049,2017794,2018561,2018306,2019073,2018818,2019585,2019330,2020097,2019842,2020609,2020354,2021121,2020866,2021633,2021378,2022145,2021890,2022657,2022402,2023169,2022914,2023681,2023426,2024193,2023938,2024705,2024450,2025217,2024962,2025729,2025474,2026241,2025986,2026753,2026498,2027265,2027010,2027777,2027522,2028289,2028034,2028801,2028546,2029313,2029058,2029825,2029570,2030337,2030082,2030849,2030594,2031361, |
22527
|
|
|
|
|
|
|
2031106}, |
22528
|
|
|
|
|
|
|
{2033666,2033922,2034178,2034434,2034690,2034946,2035202,2035458,2031617,2031873,2032129,2032385,2032641,2032897,2033153,2033409,2037762,2038018,2038274,2038530,2038786,2039042,0,0,2035713,2035969,2036225,2036481,2036737,2036993,0,0,2041858,2042114,2042370,2042626,2042882,2043138,2043394,2043650,2039809,2040065,2040321,2040577,2040833,2041089,2041345,2041601,2045954,2046210,2046466,2046722,2046978,2047234,2047490,2047746,2043905,2044161,2044417,2044673,2044929,2045185,2045441,2045697,2050050,2050306,2050562,2050818,2051074,2051330,0,0,2048001,2048257,2048513,2048769,2049025,2049281,0,0,0,2054402,0,2054914,0,2055426,0,2055938,0,2052353,0,2052865,0,2053377,0,2053889,2058242,2058498,2058754,2059010,2059266,2059522,2059778,2060034,2056193,2056449,2056705,2056961,2057217,2057473,2057729,2057985,2079234,2079490,2082818,2083074,2083330,2083586,2087426,2087682,2095106,2095362,2091522,2091778,2095618,2095874,0,0,2066434,2066690,2066946,2067202,2067458,2067714,2067970,2068226,2064385,2064641,2064897,2065153,2065409,2065665,2065921,2066177,2070530,2070786,2071042,2071298,2071554,2071810,2072066,2072322,2068481,2068737,2068993,2069249,2069505,2069761,2070017,2070273,2074626,2074882,2075138,2075394,2075650,2075906,2076162,2076418,2072577,2072833,2073089,2073345,2073601,2073857,2074113,2074369,2078722,2078978,0,2079746,0,0,0,0,2076673,2076929,2060289,2060545,2077441,0,235778,0,0,0,0,2083842,0,0,0,0,2060801,2061057,2061313,2061569,2081537,0,0,0,2086914,2087170,0,0,0,0,0,0,2084865,2085121,2061825,2062081,0,0,0,0,2091010,2091266,0,0,0,2092034,0,0,2088961,2089217,2062849,2063105,2090241,0,0,0,0,0,0,2096130,0,0,0,0,2062337,2062593,2063361,2063617,2093825,0,0,0}, |
22529
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248065,0,0,0,27393,58625,0,0,0,0,0,0,2182657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2175490,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2191361,2191617,2191873,2192129,2192385,2192641,2192897,2193153,2193409,2193665,2193921,2194177,2194433,2194689,2194945,2195201,2187266,2187522,2187778,2188034,2188290,2188546,2188802,2189058,2189314,2189570,2189826,2190082,2190338,2190594,2190850,2191106,0,0,0,2196481,2196226,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22530
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2412545,2412801,2413057,2413313,2413569,2413825,2414081,2414337,2414593,2414849,2415105,2415361,2415617,2415873,2416129,2416385,2416641,2416897,2417153,2417409,2417665,2417921,2418177,2418433,2418689,2418945,2405890,2406146,2406402,2406658,2406914,2407170,2407426,2407682,2407938,2408194,2408450,2408706,2408962,2409218,2409474,2409730,2409986,2410242,2410498,2410754,2411010,2411266,2411522,2411778,2412034,2412290,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22531
|
|
|
|
|
|
|
{2895873,2896129,2896385,2896641,2896897,2897153,2897409,2897665,2897921,2898177,2898433,2898689,2898945,2899201,2899457,2899713,2899969,2900225,2900481,2900737,2900993,2901249,2901505,2901761,2902017,2902273,2902529,2902785,2903041,2903297,2903553,2903809,2904065,2904321,2904577,2904833,2905089,2905345,2905601,2905857,2906113,2906369,2906625,2906881,2907137,2907393,2907649,2907905,2883586,2883842,2884098,2884354,2884610,2884866,2885122,2885378,2885634,2885890,2886146,2886402,2886658,2886914,2887170,2887426,2887682,2887938,2888194,2888450,2888706,2888962,2889218,2889474,2889730,2889986,2890242,2890498,2890754,2891010,2891266,2891522,2891778,2892034,2892290,2892546,2892802,2893058,2893314,2893570,2893826,2894082,2894338,2894594,2894850,2895106,2895362,2895618,2908417,2908162,158465,1932545,163073,145922,146946,2910209,2909954,2910721,2910466,2911233,2910978,151809,160001,151553,152065,0,2913025,2912770,0,2913793,2913538,0,0,0,0,0,0,0,147201,147457,2916609,2916354,2917121,2916866,2917633,2917378,2918145,2917890,2918657,2918402,2919169,2918914,2919681,2919426,2920193,2919938,2920705,2920450,2921217,2920962,2921729,2921474,2922241,2921986,2922753,2922498,2923265,2923010,2923777,2923522,2924289,2924034,2924801,2924546,2925313,2925058,2925825,2925570,2926337,2926082,2926849,2926594,2927361,2927106,2927873,2927618,2928385,2928130,2928897,2928642,2929409,2929154,2929921,2929666,2930433,2930178,2930945,2930690,2931457,2931202,2931969,2931714,2932481,2932226,2932993,2932738,2933505,2933250,2934017,2933762,2934529,2934274,2935041,2934786,2935553,2935298,2936065,2935810,2936577,2936322,2937089,2936834,2937601,2937346,2938113,2937858,2938625,2938370,2939137,2938882,2939649,2939394,2940161,2939906,2940673,2940418,2941185,2940930,2941697,2941442,0,0,0,0,0,0,0,2944001,2943746,2944513,2944258,0,0,0,2945793,2945538,0,0,0,0,0,0,0,0,0,0,0,0}, |
22532
|
|
|
|
|
|
|
{1089538,1089794,1090050,1090306,1090562,1090818,1091074,1091330,1091586,1091842,1092098,1092354,1092610,1092866,1093122,1093378,1093634,1093890,1094146,1094402,1094658,1094914,1095170,1095426,1095682,1095938,1096194,1096450,1096706,1096962,1097218,1097474,1097730,1097986,1098242,1098498,1098754,1099010,0,1099522,0,0,0,0,0,1101058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22533
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10895617,10895362,10896129,10895874,10896641,10896386,10897153,10896898,10897665,10897410,10898177,10897922,10898689,10898434,10899201,10898946,10899713,10899458,10900225,10899970,10900737,10900482,10901249,10900994,10901761,10901506,10902273,10902018,10902785,10902530,10903297,10903042,10903809,10903554,10904321,10904066,10904833,10904578,10905345,10905090,10905857,10905602,10906369,10906114,10906881,10906626,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10912001,10911746,10912513,10912258,10913025,10912770,10913537,10913282,10914049,10913794,10914561,10914306,10915073,10914818,10915585,10915330,10916097,10915842,10916609,10916354,10917121,10916866,10917633,10917378,10918145,10917890,10918657,10918402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22534
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10953473,10953218,10953985,10953730,10954497,10954242,10955009,10954754,10955521,10955266,10956033,10955778,10956545,10956290,0,0,10957569,10957314,10958081,10957826,10958593,10958338,10959105,10958850,10959617,10959362,10960129,10959874,10960641,10960386,10961153,10960898,10961665,10961410,10962177,10961922,10962689,10962434,10963201,10962946,10963713,10963458,10964225,10963970,10964737,10964482,10965249,10964994,10965761,10965506,10966273,10966018,10966785,10966530,10967297,10967042,10967809,10967554,10968321,10968066,10968833,10968578,10969345,10969090,10969857,10969602,10970369,10970114,10970881,10970626,10971393,10971138,10971905,10971650,10972417,10972162,10972929,10972674,0,0,0,0,0,0,0,0,0,10975745,10975490,10976257,10976002,1931521,10977025,10976770,10977537,10977282,10978049,10977794,10978561,10978306,10979073,10978818,0,0,0,10980353,10980098,156929,0,0,10981633,10981378,10982145,10981890,10994690,0,10983169,10982914,10983681,10983426,10984193,10983938,10984705,10984450,10985217,10984962,10985729,10985474,10986241,10985986,10986753,10986498,10987265,10987010,10987777,10987522,157185,154625,155905,158721,158209,0,171521,165633,171265,11227905,10990849,10990594,10991361,10991106,10991873,10991618,10992385,10992130,10992897,10992642,10993409,10993154,10993921,10993666,10994433,10994178,10982401,164353,1936897,10995713,10995458,10996225,10995970,0,0,0,0,0,10998017,10997762,0,0,0,0,10999553,10999298,11000065,10999810,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11007489,11007234,0,0,0,0,0,0,0,0,0}, |
22535
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10990338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1286146,1286402,1286658,1286914,1287170,1287426,1287682,1287938,1288194,1288450,1288706,1288962,1289218,1289474,1289730,1289986,1290242,1290498,1290754,1291010,1291266,1291522,1291778,1292034,1292290,1292546,1292802,1293058,1293314,1293570,1293826,1294082,1294338,1294594,1294850,1295106,1295362,1295618,1295874,1296130,1296386,1296642,1296898,1297154,1297410,1297666,1297922,1298178,1298434,1298690,1298946,1299202,1299458,1299714,1299970,1300226,1300482,1300738,1300994,1301250,1301506,1301762,1302018,1302274,1302530,1302786,1303042,1303298,1303554,1303810,1304066,1304322,1304578,1304834,1305090,1305346,1305602,1305858,1306114,1306370,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22536
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16728321,16728577,16728833,16729089,16729345,16729601,16729857,16730113,16730369,16730625,16730881,16731137,16731393,16731649,16731905,16732161,16732417,16732673,16732929,16733185,16733441,16733697,16733953,16734209,16734465,16734721,0,0,0,0,0,0,16720130,16720386,16720642,16720898,16721154,16721410,16721666,16721922,16722178,16722434,16722690,16722946,16723202,16723458,16723714,16723970,16724226,16724482,16724738,16724994,16725250,16725506,16725762,16726018,16726274,16726530,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22537
|
|
|
|
|
|
|
{17049601,17049857,17050113,17050369,17050625,17050881,17051137,17051393,17051649,17051905,17052161,17052417,17052673,17052929,17053185,17053441,17053697,17053953,17054209,17054465,17054721,17054977,17055233,17055489,17055745,17056001,17056257,17056513,17056769,17057025,17057281,17057537,17057793,17058049,17058305,17058561,17058817,17059073,17059329,17059585,17039362,17039618,17039874,17040130,17040386,17040642,17040898,17041154,17041410,17041666,17041922,17042178,17042434,17042690,17042946,17043202,17043458,17043714,17043970,17044226,17044482,17044738,17044994,17045250,17045506,17045762,17046018,17046274,17046530,17046786,17047042,17047298,17047554,17047810,17048066,17048322,17048578,17048834,17049090,17049346,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17094657,17094913,17095169,17095425,17095681,17095937,17096193,17096449,17096705,17096961,17097217,17097473,17097729,17097985,17098241,17098497,17098753,17099009,17099265,17099521,17099777,17100033,17100289,17100545,17100801,17101057,17101313,17101569,17101825,17102081,17102337,17102593,17102849,17103105,17103361,17103617,0,0,0,0,17084418,17084674,17084930,17085186,17085442,17085698,17085954,17086210,17086466,17086722,17086978,17087234,17087490,17087746,17088002,17088258,17088514,17088770,17089026,17089282,17089538,17089794,17090050,17090306,17090562,17090818,17091074,17091330,17091586,17091842,17092098,17092354,17092610,17092866,17093122,17093378,0,0,0,0}, |
22538
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17143553,17143809,17144065,17144321,17144577,17144833,17145089,17145345,17145601,17145857,17146113,0,17146625,17146881,17147137,17147393,17147649,17147905,17148161,17148417,17148673,17148929,17149185,17149441,17149697,17149953,17150209,0,17150721,17150977,17151233,17151489,17151745,17152001,17152257,0,17152769,17153025,0,17133570,17133826,17134082,17134338,17134594,17134850,17135106,17135362,17135618,17135874,17136130,0,17136642,17136898,17137154,17137410,17137666,17137922,17138178,17138434,17138690,17138946,17139202,17139458,17139714,17139970,17140226,0,17140738,17140994,17141250,17141506,17141762,17142018,17142274,0,17142786,17143042,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22539
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17612801,17613057,17613313,17613569,17613825,17614081,17614337,17614593,17614849,17615105,17615361,17615617,17615873,17616129,17616385,17616641,17616897,17617153,17617409,17617665,17617921,17618177,17618433,17618689,17618945,17619201,17619457,17619713,17619969,17620225,17620481,17620737,17620993,17621249,17621505,17621761,17622017,17622273,17622529,17622785,17623041,17623297,17623553,17623809,17624065,17624321,17624577,17624833,17625089,17625345,17625601,0,0,0,0,0,0,0,0,0,0,0,0,0,17596418,17596674,17596930,17597186,17597442,17597698,17597954,17598210,17598466,17598722,17598978,17599234,17599490,17599746,17600002,17600258,17600514,17600770,17601026,17601282,17601538,17601794,17602050,17602306,17602562,17602818,17603074,17603330,17603586,17603842,17604098,17604354,17604610,17604866,17605122,17605378,17605634,17605890,17606146,17606402,17606658,17606914,17607170,17607426,17607682,17607938,17608194,17608450,17608706,17608962,17609218,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22540
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18399233,18399489,18399745,18400001,18400257,18400513,18400769,18401025,18401281,18401537,18401793,18402049,18402305,18402561,18402817,18403073,18403329,18403585,18403841,18404097,18404353,18404609,18404865,18405121,18405377,18405633,18405889,18406145,18406401,18406657,18406913,18407169,18391042,18391298,18391554,18391810,18392066,18392322,18392578,18392834,18393090,18393346,18393602,18393858,18394114,18394370,18394626,18394882,18395138,18395394,18395650,18395906,18396162,18396418,18396674,18396930,18397186,18397442,18397698,18397954,18398210,18398466,18398722,18398978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22541
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24010753,24011009,24011265,24011521,24011777,24012033,24012289,24012545,24012801,24013057,24013313,24013569,24013825,24014081,24014337,24014593,24014849,24015105,24015361,24015617,24015873,24016129,24016385,24016641,24016897,24017153,24017409,24017665,24017921,24018177,24018433,24018689,24002562,24002818,24003074,24003330,24003586,24003842,24004098,24004354,24004610,24004866,24005122,24005378,24005634,24005890,24006146,24006402,24006658,24006914,24007170,24007426,24007682,24007938,24008194,24008450,24008706,24008962,24009218,24009474,24009730,24009986,24010242,24010498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22542
|
|
|
|
|
|
|
{32055809,32056065,32056321,32056577,32056833,32057089,32057345,32057601,32057857,32058113,32058369,32058625,32058881,32059137,32059393,32059649,32059905,32060161,32060417,32060673,32060929,32061185,32061441,32061697,32061953,32062209,32062465,32062721,32062977,32063233,32063489,32063745,32064001,32064257,32047106,32047362,32047618,32047874,32048130,32048386,32048642,32048898,32049154,32049410,32049666,32049922,32050178,32050434,32050690,32050946,32051202,32051458,32051714,32051970,32052226,32052482,32052738,32052994,32053250,32053506,32053762,32054018,32054274,32054530,32054786,32055042,32055298,32055554,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
22543
|
|
|
|
|
|
|
}; |
22544
|
|
|
|
|
|
|
|
22545
|
|
|
|
|
|
|
} // namespace unilib |
22546
|
|
|
|
|
|
|
|
22547
|
|
|
|
|
|
|
///////// |
22548
|
|
|
|
|
|
|
// File: unilib/uninorms.cpp |
22549
|
|
|
|
|
|
|
///////// |
22550
|
|
|
|
|
|
|
|
22551
|
|
|
|
|
|
|
// This file is part of UniLib . |
22552
|
|
|
|
|
|
|
// |
22553
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
22554
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
22555
|
|
|
|
|
|
|
// |
22556
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
22557
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
22558
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
22559
|
|
|
|
|
|
|
// |
22560
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
22561
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
22562
|
|
|
|
|
|
|
|
22563
|
|
|
|
|
|
|
namespace unilib { |
22564
|
|
|
|
|
|
|
|
22565
|
0
|
|
|
|
|
|
void uninorms::nfc(std::u32string& str) { |
22566
|
0
|
|
|
|
|
|
decompose(str, false); |
22567
|
0
|
|
|
|
|
|
compose(str); |
22568
|
0
|
|
|
|
|
|
} |
22569
|
|
|
|
|
|
|
|
22570
|
0
|
|
|
|
|
|
void uninorms::nfd(std::u32string& str) { |
22571
|
0
|
|
|
|
|
|
decompose(str, false); |
22572
|
0
|
|
|
|
|
|
} |
22573
|
|
|
|
|
|
|
|
22574
|
0
|
|
|
|
|
|
void uninorms::nfkc(std::u32string& str) { |
22575
|
0
|
|
|
|
|
|
decompose(str, true); |
22576
|
0
|
|
|
|
|
|
compose(str); |
22577
|
0
|
|
|
|
|
|
} |
22578
|
|
|
|
|
|
|
|
22579
|
0
|
|
|
|
|
|
void uninorms::nfkd(std::u32string& str) { |
22580
|
0
|
0
|
|
|
|
|
decompose(str, true); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22581
|
0
|
|
|
|
|
|
} |
22582
|
|
|
|
|
|
|
|
22583
|
0
|
|
|
|
|
|
void uninorms::compose(std::u32string& str) { |
22584
|
|
|
|
|
|
|
size_t old, com; |
22585
|
0
|
0
|
|
|
|
|
for (old = 0, com = 0; old < str.size(); old++, com++) { |
22586
|
0
|
|
|
|
|
|
str[com] = str[old]; |
22587
|
0
|
0
|
|
|
|
|
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22588
|
|
|
|
|
|
|
// Check Hangul composition L + V [+ T]. |
22589
|
0
|
0
|
|
|
|
|
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22590
|
0
|
|
|
|
|
|
str[com] = Hangul::SBase + ((str[old] - Hangul::LBase) * Hangul::VCount + str[old + 1] - Hangul::VBase) * Hangul::TCount; |
22591
|
|
|
|
|
|
|
old++; |
22592
|
0
|
0
|
|
|
|
|
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22593
|
0
|
|
|
|
|
|
str[com] += str[++old] - Hangul::TBase; |
22594
|
|
|
|
|
|
|
} |
22595
|
0
|
0
|
|
|
|
|
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22596
|
|
|
|
|
|
|
// Check Hangul composition LV + T |
22597
|
0
|
0
|
|
|
|
|
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22598
|
0
|
|
|
|
|
|
str[com] += str[++old] - Hangul::TBase; |
22599
|
0
|
0
|
|
|
|
|
} else if (str[old] < CHARS) { |
22600
|
|
|
|
|
|
|
// Check composition_data |
22601
|
0
|
|
|
|
|
|
auto composition = &composition_block[composition_index[str[old] >> 8]][str[old] & 0xFF]; |
22602
|
|
|
|
|
|
|
auto starter = com; |
22603
|
0
|
0
|
|
|
|
|
for (int last_ccc = -1; old + 1 < str.size(); old++) { |
22604
|
0
|
0
|
|
|
|
|
int ccc = str[old + 1] < CHARS ? ccc_block[ccc_index[str[old + 1] >> 8]][str[old + 1] & 0xFF] : 0; |
22605
|
0
|
0
|
|
|
|
|
if (composition[1] - composition[0] && last_ccc < ccc) { |
|
|
0
|
|
|
|
|
|
22606
|
|
|
|
|
|
|
// Try finding a composition. |
22607
|
|
|
|
|
|
|
auto l = composition[0], r = composition[1]; |
22608
|
0
|
0
|
|
|
|
|
while (l + 2 < r) { |
22609
|
0
|
|
|
|
|
|
auto m = l + (((r - l) >> 1) & ~1); |
22610
|
0
|
0
|
|
|
|
|
if (composition_data[m] <= str[old + 1]) l = m; |
22611
|
0
|
0
|
|
|
|
|
if (composition_data[m] >= str[old + 1]) r = m; |
22612
|
|
|
|
|
|
|
} |
22613
|
0
|
0
|
|
|
|
|
if (composition_data[l] == str[old + 1]) { |
22614
|
|
|
|
|
|
|
// Found a composition. |
22615
|
0
|
|
|
|
|
|
str[starter] = composition_data[l + 1]; |
22616
|
0
|
|
|
|
|
|
composition = &composition_block[composition_index[composition_data[l + 1] >> 8]][composition_data[l + 1] & 0xFF]; |
22617
|
0
|
|
|
|
|
|
continue; |
22618
|
|
|
|
|
|
|
} |
22619
|
|
|
|
|
|
|
} |
22620
|
|
|
|
|
|
|
|
22621
|
0
|
0
|
|
|
|
|
if (!ccc) break; |
22622
|
|
|
|
|
|
|
last_ccc = ccc; |
22623
|
0
|
|
|
|
|
|
str[++com] = str[old + 1]; |
22624
|
|
|
|
|
|
|
} |
22625
|
|
|
|
|
|
|
} |
22626
|
|
|
|
|
|
|
} |
22627
|
|
|
|
|
|
|
|
22628
|
0
|
0
|
|
|
|
|
if (com < old) str.resize(com); |
22629
|
0
|
|
|
|
|
|
} |
22630
|
|
|
|
|
|
|
|
22631
|
0
|
|
|
|
|
|
void uninorms::decompose(std::u32string& str, bool kompatibility) { |
22632
|
|
|
|
|
|
|
// Count how much additional space do we need. |
22633
|
|
|
|
|
|
|
bool any_decomposition = false; |
22634
|
|
|
|
|
|
|
size_t additional = 0; |
22635
|
0
|
0
|
|
|
|
|
for (auto&& chr : str) { |
22636
|
|
|
|
|
|
|
int decomposition_len = 0; |
22637
|
|
|
|
|
|
|
|
22638
|
0
|
0
|
|
|
|
|
if (chr >= Hangul::SBase && chr < Hangul::SBase + Hangul::SCount) { |
22639
|
|
|
|
|
|
|
// Hangul decomposition. |
22640
|
0
|
0
|
|
|
|
|
decomposition_len = 2 + ((chr - Hangul::SBase) % Hangul::TCount ? 1 : 0); |
22641
|
0
|
0
|
|
|
|
|
} else if (chr < CHARS) { |
22642
|
|
|
|
|
|
|
// Check decomposition_data. |
22643
|
0
|
|
|
|
|
|
auto decomposition = &decomposition_block[decomposition_index[chr >> 8]][chr & 0xFF]; |
22644
|
0
|
|
|
|
|
|
decomposition_len = (decomposition[1] >> 2) - (decomposition[0] >> 2); |
22645
|
0
|
0
|
|
|
|
|
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
|
0
|
|
|
|
|
|
22646
|
0
|
0
|
|
|
|
|
if (decomposition_len && kompatibility && (decomposition[0] & 2)) |
|
|
0
|
|
|
|
|
|
22647
|
|
|
|
|
|
|
// Further kompatibility decomposition. |
22648
|
0
|
0
|
|
|
|
|
for (auto i = decomposition[0] >> 2; i < decomposition[1] >> 2; i++) { |
22649
|
0
|
|
|
|
|
|
auto further_decomposition = &decomposition_block[decomposition_index[decomposition_data[i] >> 8]][decomposition_data[i] & 0xFF]; |
22650
|
0
|
0
|
|
|
|
|
if (further_decomposition[0] & 1) decomposition_len += (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2) - 1; |
22651
|
|
|
|
|
|
|
} |
22652
|
|
|
|
|
|
|
} |
22653
|
|
|
|
|
|
|
// Do we decompose current character? |
22654
|
0
|
0
|
|
|
|
|
if (!decomposition_len) continue; |
22655
|
|
|
|
|
|
|
any_decomposition = true; |
22656
|
0
|
|
|
|
|
|
additional += decomposition_len - 1; |
22657
|
|
|
|
|
|
|
} |
22658
|
|
|
|
|
|
|
|
22659
|
|
|
|
|
|
|
// If needed, allocate enough space and perform the decomposition. |
22660
|
0
|
0
|
|
|
|
|
if (any_decomposition) { |
22661
|
0
|
|
|
|
|
|
str.resize(str.size() + additional); |
22662
|
0
|
0
|
|
|
|
|
for (size_t dec = str.size(), old = dec - additional; old--; ) |
22663
|
0
|
0
|
|
|
|
|
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22664
|
|
|
|
|
|
|
// Hangul decomposition. |
22665
|
0
|
|
|
|
|
|
char32_t s_index = str[old] - Hangul::SBase; |
22666
|
0
|
0
|
|
|
|
|
if (s_index % Hangul::TCount) str[--dec] = Hangul::TBase + s_index % Hangul::TCount; |
22667
|
0
|
|
|
|
|
|
str[--dec] = Hangul::VBase + (s_index % Hangul::NCount) / Hangul::TCount; |
22668
|
0
|
|
|
|
|
|
str[--dec] = Hangul::LBase + s_index / Hangul::NCount; |
22669
|
0
|
0
|
|
|
|
|
} else if (str[old] < CHARS) { |
22670
|
|
|
|
|
|
|
// Check decomposition_data. |
22671
|
0
|
|
|
|
|
|
auto decomposition = &decomposition_block[decomposition_index[str[old] >> 8]][str[old] & 0xFF]; |
22672
|
0
|
|
|
|
|
|
int decomposition_len = (decomposition[1] >> 2) - (decomposition[0] >> 2); |
22673
|
0
|
0
|
|
|
|
|
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
|
0
|
|
|
|
|
|
22674
|
0
|
0
|
|
|
|
|
if (decomposition_len && kompatibility && (decomposition[0] & 2)) { |
|
|
0
|
|
|
|
|
|
22675
|
|
|
|
|
|
|
// Further kompatibility decomposition. |
22676
|
0
|
0
|
|
|
|
|
while (decomposition_len--) { |
22677
|
0
|
|
|
|
|
|
auto chr = decomposition_data[(decomposition[0] >> 2) + decomposition_len]; |
22678
|
0
|
|
|
|
|
|
auto further_decomposition = &decomposition_block[decomposition_index[chr >> 8]][chr & 0xFF]; |
22679
|
0
|
0
|
|
|
|
|
if (further_decomposition[0] & 1) { |
22680
|
0
|
0
|
|
|
|
|
for (int further_decomposition_len = (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2); further_decomposition_len--; ) |
22681
|
0
|
|
|
|
|
|
str[--dec] = decomposition_data[(further_decomposition[0] >> 2) + further_decomposition_len]; |
22682
|
|
|
|
|
|
|
} else { |
22683
|
0
|
|
|
|
|
|
str[--dec] = chr; |
22684
|
|
|
|
|
|
|
} |
22685
|
|
|
|
|
|
|
} |
22686
|
0
|
0
|
|
|
|
|
} else if (decomposition_len) { |
22687
|
|
|
|
|
|
|
// Non-recursive decomposition. |
22688
|
0
|
0
|
|
|
|
|
while (decomposition_len--) |
22689
|
0
|
|
|
|
|
|
str[--dec] = decomposition_data[(decomposition[0] >> 2) + decomposition_len]; |
22690
|
|
|
|
|
|
|
} else { |
22691
|
|
|
|
|
|
|
// No decomposition. |
22692
|
0
|
|
|
|
|
|
str[--dec] = str[old]; |
22693
|
|
|
|
|
|
|
} |
22694
|
|
|
|
|
|
|
} else { |
22695
|
|
|
|
|
|
|
// Non-Unicode character. |
22696
|
0
|
|
|
|
|
|
str[--dec] = str[old]; |
22697
|
|
|
|
|
|
|
} |
22698
|
|
|
|
|
|
|
} |
22699
|
|
|
|
|
|
|
|
22700
|
|
|
|
|
|
|
// Sort combining marks. |
22701
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < str.size(); i++) { |
22702
|
0
|
0
|
|
|
|
|
unsigned ccc = str[i] < CHARS ? ccc_block[ccc_index[str[i] >> 8]][str[i] & 0xFF] : 0; |
22703
|
0
|
0
|
|
|
|
|
if (!ccc) continue; |
22704
|
|
|
|
|
|
|
|
22705
|
|
|
|
|
|
|
auto chr = str[i]; |
22706
|
|
|
|
|
|
|
size_t j; |
22707
|
0
|
0
|
|
|
|
|
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
22708
|
0
|
|
|
|
|
|
str[j] = chr; |
22709
|
|
|
|
|
|
|
} |
22710
|
0
|
|
|
|
|
|
} |
22711
|
|
|
|
|
|
|
|
22712
|
|
|
|
|
|
|
// Data fields |
22713
|
|
|
|
|
|
|
const char32_t uninorms::CHARS; |
22714
|
|
|
|
|
|
|
|
22715
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::SBase; |
22716
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::LBase; |
22717
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::VBase; |
22718
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::TBase; |
22719
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::LCount; |
22720
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::VCount; |
22721
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::TCount; |
22722
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::NCount; |
22723
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::SCount; |
22724
|
|
|
|
|
|
|
|
22725
|
|
|
|
|
|
|
const uint8_t uninorms::ccc_index[uninorms::CHARS >> 8] = { |
22726
|
|
|
|
|
|
|
0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,0,15,0,0,0,16,17,18,19,20,21,22,0,0,23,0,0,0,0,0,0,0,0,0,0,0,24,25,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,28,29,30,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,33,0,0,34,35,36,0,0,0,0,0,0,37,0,0,38,39,40,41,42,43,44,45,46,47,48,49,50,51,0,52,53,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,56,0,0,0,57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,59,60,0,0,0,0,0,0,0,0,0,0,0,0,0,61,56,62,0,63,0,0,0,64,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
22727
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 |
22728
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 |
22729
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 |
22730
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 |
22731
|
|
|
|
|
|
|
}; |
22732
|
|
|
|
|
|
|
const uint8_t uninorms::ccc_block[][256] = { |
22733
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22734
|
|
|
|
|
|
|
{230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,232,220,220,220,220,232,216,220,220,220,220,220,202,202,220,220,220,220,202,202,220,220,220,220,220,220,220,220,220,220,220,1,1,1,1,1,220,220,220,220,230,230,230,230,230,230,230,230,240,230,220,220,220,230,230,230,220,220,0,230,230,230,220,220,220,220,230,232,220,220,230,233,234,234,233,234,234,233,230,230,230,230,230,230,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22735
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22736
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,230,230,230,230,220,230,230,230,222,220,230,230,230,230,230,230,220,220,220,220,220,220,230,230,220,230,230,222,228,230,10,11,12,13,14,15,16,17,18,19,19,20,21,22,0,23,0,24,25,0,230,220,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22737
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,30,31,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,28,29,30,31,32,33,34,230,230,220,220,230,230,230,230,230,220,230,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,0,0,230,230,230,230,220,230,0,0,230,230,0,220,230,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22738
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,230,230,220,230,230,220,220,220,230,220,220,230,220,230,230,230,220,230,220,230,220,230,220,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,220,230,0,0,0,0,0,0,0,0,0,220,0,0}, |
22739
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,230,230,230,230,230,230,230,230,230,0,230,230,230,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,220,220,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,220,220,220,220,220,230,230,230,230,230,230,230,230,230,230,230,230,230,230,0,220,230,230,220,230,230,220,230,230,230,220,220,220,27,28,29,230,230,230,220,230,230,220,220,230,230,230,230,230}, |
22740
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,230,220,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0}, |
22741
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22742
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22743
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,84,91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22744
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22745
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,103,103,9,0,0,0,0,0,0,0,0,0,0,0,0,0,107,107,107,107,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,118,118,9,0,0,0,0,0,0,0,0,0,0,0,0,0,122,122,122,122,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22746
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,220,0,216,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,129,130,0,132,0,0,0,0,0,130,130,130,130,0,0,130,0,230,230,9,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22747
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22748
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22749
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22750
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,228,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22751
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,222,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22752
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,220,220,220,220,220,220,230,230,220,0,220,220,230,230,220,220,230,230,230,230,230,220,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22753
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0}, |
22754
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,1,220,220,220,220,220,230,230,220,220,220,220,230,0,1,1,1,1,1,1,1,0,0,0,0,220,0,0,0,0,0,0,230,0,0,0,230,230,0,0,0,0,0,0}, |
22755
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,220,230,230,230,230,230,230,230,220,230,230,234,214,220,202,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,232,228,228,220,218,230,233,220,230,220}, |
22756
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,1,1,230,230,230,230,1,1,1,230,230,0,0,0,0,230,0,0,0,1,1,230,220,230,1,1,220,220,220,220,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22757
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22758
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230}, |
22759
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,218,228,232,222,224,224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22760
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,230,230,230,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22761
|
|
|
|
|
|
|
{0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22762
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22763
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,230,230,220,0,0,230,230,0,0,0,0,0,230,230,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0}, |
22764
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22765
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22766
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,220,220,220,220,220,220,220,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22767
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0}, |
22768
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22769
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22770
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,1,220,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22771
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22772
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220}, |
22773
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,230,230,230,220,230,220,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22774
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22775
|
|
|
|
|
|
|
{230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22776
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22777
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,0,0,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22778
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22779
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22780
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22781
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22782
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22783
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22784
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22785
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22786
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22787
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22788
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0}, |
22789
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22790
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22791
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22792
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,216,216,1,1,1,0,0,0,226,216,216,216,216,216,0,0,0,0,0,0,0,0,220,220,220,220,220,220,220,220,0,0,230,230,230,230,230,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22793
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22794
|
|
|
|
|
|
|
{230,230,230,230,230,230,230,0,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,0,0,230,230,230,230,230,230,230,0,230,230,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22795
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22796
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,232,232,220,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22797
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220,220,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22798
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
22799
|
|
|
|
|
|
|
}; |
22800
|
|
|
|
|
|
|
|
22801
|
|
|
|
|
|
|
const uint8_t uninorms::composition_index[uninorms::CHARS >> 8] = { |
22802
|
|
|
|
|
|
|
0,1,2,3,4,5,6,5,5,7,5,8,9,10,5,5,11,5,5,5,5,5,5,5,5,5,5,12,5,5,13,14,5,15,16,5,5,5,5,5,5,5,5,5,5,5,5,5,17,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,18,19,5,20,21,22,5,5,5,23,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, |
22803
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 |
22804
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 |
22805
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 |
22806
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 |
22807
|
|
|
|
|
|
|
}; |
22808
|
|
|
|
|
|
|
const uint16_t uninorms::composition_block[][257] = { |
22809
|
|
|
|
|
|
|
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,5,7,7,7,39,45,55,67,101,103,117,131,161,163,173,185,191,209,241,245,245,261,275,289,327,331,343,347,365,377,377,377,377,377,377,377,409,415,425,437,471,473,487,503,531,535,545,557,563,581,613,617,617,633,647,663,701,705,719,723,743,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,769,769,771,773,777,779,779,779,787,787,787,787,787,789,789,789,789,789,797,803,805,805,807,807,807,807,815,815,815,815,815,815,823,823,825,827,831,833,833,833,841,841,841,841,841,843,843,843,843,843,851,857,859,859,861,861,861,861,869,869,869,869}, |
22810
|
|
|
|
|
|
|
{869,869,869,877,885,885,885,885,885,885,885,885,885,885,885,885,885,885,885,889,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,897,901,901,901,901,901,901,901,901,901,901,901,901,901,903,905,905,905,905,905,907,909,909,909,909,909,909,909,911,913,915,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,929,939,939,939,939,939,939,939,939,939,939,939,939,939,939,949,959,959,959,959,959,959,959,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,963,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965}, |
22811
|
|
|
|
|
|
|
{965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,967,969,971,973,973,973,973,973,975,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979}, |
22812
|
|
|
|
|
|
|
{979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,993,993,993,993,1001,1001,1011,1011,1025,1025,1025,1025,1025,1025,1033,1033,1035,1035,1035,1035,1047,1047,1047,1047,1057,1057,1057,1059,1059,1061,1061,1061,1077,1077,1077,1077,1085,1085,1097,1097,1113,1113,1113,1113,1113,1113,1121,1121,1125,1125,1125,1125,1141,1141,1141,1141,1153,1159,1165,1165,1165,1167,1167,1167,1167,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171}, |
22813
|
|
|
|
|
|
|
{1171,1171,1171,1171,1171,1171,1171,1173,1173,1173,1173,1173,1173,1173,1173,1173,1173,1177,1177,1177,1179,1179,1185,1189,1191,1199,1199,1201,1201,1201,1201,1203,1203,1203,1203,1203,1211,1211,1211,1211,1213,1213,1213,1213,1215,1215,1217,1217,1217,1221,1221,1221,1223,1223,1229,1233,1235,1243,1243,1245,1245,1245,1245,1247,1247,1247,1247,1247,1255,1255,1255,1255,1257,1257,1257,1257,1259,1259,1261,1261,1261,1261,1261,1261,1261,1261,1261,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1265,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1269,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1273,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275}, |
22814
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22815
|
|
|
|
|
|
|
{1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1283,1283,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1289,1289,1289,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291}, |
22816
|
|
|
|
|
|
|
{1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1293,1293,1293,1293,1293,1293,1293,1293,1295,1295,1295,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301}, |
22817
|
|
|
|
|
|
|
{1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1313,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315}, |
22818
|
|
|
|
|
|
|
{1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1319,1319,1319,1319,1319,1319,1319,1325,1325,1325,1325,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327}, |
22819
|
|
|
|
|
|
|
{1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1331,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1339,1339,1339,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341}, |
22820
|
|
|
|
|
|
|
{1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343}, |
22821
|
|
|
|
|
|
|
{1343,1343,1343,1343,1343,1343,1345,1345,1347,1347,1349,1349,1351,1351,1353,1353,1353,1353,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1357,1357,1359,1359,1361,1363,1363,1363,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365}, |
22822
|
|
|
|
|
|
|
{1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1367,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1371,1373,1373,1373,1373,1373,1373,1373,1375,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1381,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1387,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1391,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393}, |
22823
|
|
|
|
|
|
|
{1393,1401,1409,1411,1413,1415,1417,1419,1421,1429,1437,1439,1441,1443,1445,1447,1449,1453,1457,1457,1457,1457,1457,1457,1457,1461,1465,1465,1465,1465,1465,1465,1465,1473,1481,1483,1485,1487,1489,1491,1493,1501,1509,1511,1513,1515,1517,1519,1521,1527,1533,1533,1533,1533,1533,1533,1533,1539,1545,1545,1545,1545,1545,1545,1545,1549,1553,1553,1553,1553,1553,1553,1553,1557,1561,1561,1561,1561,1561,1561,1561,1567,1573,1573,1573,1573,1573,1573,1573,1573,1579,1579,1579,1579,1579,1579,1579,1587,1595,1597,1599,1601,1603,1605,1607,1615,1623,1625,1627,1629,1631,1633,1635,1637,1637,1637,1637,1639,1639,1639,1639,1639,1639,1639,1639,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1643,1643,1643,1643,1643,1643,1643,1643,1643,1649,1649,1649,1649,1649,1649,1649,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1653,1653,1653,1653,1653,1653,1653,1653,1659,1659}, |
22824
|
|
|
|
|
|
|
{1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1661,1661,1663,1663,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1667,1667,1669,1669,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671}, |
22825
|
|
|
|
|
|
|
{1671,1671,1671,1671,1673,1673,1673,1673,1673,1675,1675,1675,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1679,1679,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1683,1683,1683,1683,1683,1683,1683,1685,1685,1687,1687,1687,1689,1689,1689,1689,1689,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1693,1693,1693,1695,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1699,1701,1701,1701,1703,1705,1705,1705,1707,1709,1711,1713,1713,1713,1713,1713,1715,1717,1717,1717,1719,1721,1721,1721,1721,1721,1721,1721,1721,1721,1721,1723,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1727,1727,1727,1727,1727,1727,1729,1731,1731,1733,1733,1733,1733,1733,1733,1733,1735,1737,1739,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741}, |
22826
|
|
|
|
|
|
|
{1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1743,1743,1743,1743,1743,1745,1745,1747,1747,1749,1749,1751,1751,1753,1753,1755,1755,1757,1757,1759,1759,1761,1761,1763,1763,1765,1765,1767,1767,1767,1769,1769,1771,1771,1773,1773,1773,1773,1773,1773,1773,1777,1777,1777,1781,1781,1781,1785,1785,1785,1789,1789,1789,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1795,1795,1795,1795,1795,1795,1795,1795,1795,1797,1797,1797,1797,1797,1799,1799,1801,1801,1803,1803,1805,1805,1807,1807,1809,1809,1811,1811,1813,1813,1815,1815,1817,1817,1819,1819,1821,1821,1821,1823,1823,1825,1825,1827,1827,1827,1827,1827,1827,1827,1831,1831,1831,1835,1835,1835,1839,1839,1839,1843,1843,1843,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1849,1851,1853,1855,1855,1855,1855,1855,1855,1855,1855,1855,1855,1855,1857,1857,1857}, |
22827
|
|
|
|
|
|
|
{1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1859,1859,1861,1861,1861,1861,1861,1861,1861,1861,1861,1861,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863}, |
22828
|
|
|
|
|
|
|
{1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1865,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867}, |
22829
|
|
|
|
|
|
|
{1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871}, |
22830
|
|
|
|
|
|
|
{1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877}, |
22831
|
|
|
|
|
|
|
{1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1879,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881}, |
22832
|
|
|
|
|
|
|
{1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883} |
22833
|
|
|
|
|
|
|
}; |
22834
|
|
|
|
|
|
|
const char32_t uninorms::composition_data[] = { |
22835
|
|
|
|
|
|
|
0,824,8814,824,8800,824,8815,768,192,769,193,770,194,771,195,772,256,774,258,775,550,776,196,777,7842,778,197,780,461,783,512,785,514,803,7840,805,7680,808,260,775,7682,803,7684,817,7686,769,262,770,264,775,266,780,268,807,199,775,7690,780,270,803,7692,807,7696,813,7698,817,7694,768,200,769,201,770,202,771,7868,772,274,774,276,775,278,776,203,777,7866,780,282,783,516,785,518,803,7864,807,552,808,280,813,7704,816,7706,775,7710,769,500,770,284,772,7712,774,286,775,288,780,486,807,290,770,292,775,7714,776,7718,780,542,803,7716,807,7720,814,7722,768,204,769,205,770,206,771,296,772,298,774,300,775,304,776,207,777,7880,780,463,783,520,785,522,803,7882,808,302,816,7724,770,308,769,7728,780,488,803,7730,807,310,817,7732,769,313,780,317,803,7734,807,315,813,7740,817,7738,769,7742,775,7744,803,7746,768,504,769,323,771,209,775,7748,780,327,803,7750,807,325,813,7754,817,7752,768,210,769,211,770,212,771,213,772,332,774,334,775,558,776,214,777,7886,779,336,780,465,783,524,785,526,795,416,803,7884,808,490,769,7764,775,7766,769,340,775,7768,780,344,783,528,785,530,803,7770,807,342,817,7774,769,346,770,348,775,7776,780,352,803,7778,806,536,807,350,775,7786,780,356,803,7788,806,538,807,354,813,7792,817,7790,768,217,769,218,770,219,771,360,772,362,774,364,776,220,777,7910,778,366,779,368,780,467,783,532,785,534,795,431,803,7908,804,7794,808,370,813,7798,816,7796,771,7804,803,7806,768,7808,769,7810,770,372,775,7814,776,7812,803,7816,775,7818,776,7820,768,7922,769,221,770,374,771,7928,772,562,775,7822,776,376,777,7926,803,7924,769,377,770,7824,775,379,780,381,803,7826,817,7828,768,224,769,225,770,226,771,227,772,257,774,259,775,551,776,228,777,7843,778,229,780,462,783,513,785,515,803,7841,805,7681,808,261,775,7683,803,7685,817,7687,769,263,770,265,775,267,780,269,807,231,775,7691,780,271,803,7693,807,7697,813,7699,817,7695,768,232,769,233,770,234,771,7869,772,275,774,277,775,279,776,235,777,7867,780,283,783,517,785,519,803,7865,807,553,808,281,813,7705,816,7707,775,7711,769,501,770, |
22836
|
|
|
|
|
|
|
285,772,7713,774,287,775,289,780,487,807,291,770,293,775,7715,776,7719,780,543,803,7717,807,7721,814,7723,817,7830,768,236,769,237,770,238,771,297,772,299,774,301,776,239,777,7881,780,464,783,521,785,523,803,7883,808,303,816,7725,770,309,780,496,769,7729,780,489,803,7731,807,311,817,7733,769,314,780,318,803,7735,807,316,813,7741,817,7739,769,7743,775,7745,803,7747,768,505,769,324,771,241,775,7749,780,328,803,7751,807,326,813,7755,817,7753,768,242,769,243,770,244,771,245,772,333,774,335,775,559,776,246,777,7887,779,337,780,466,783,525,785,527,795,417,803,7885,808,491,769,7765,775,7767,769,341,775,7769,780,345,783,529,785,531,803,7771,807,343,817,7775,769,347,770,349,775,7777,780,353,803,7779,806,537,807,351,775,7787,776,7831,780,357,803,7789,806,539,807,355,813,7793,817,7791,768,249,769,250,770,251,771,361,772,363,774,365,776,252,777,7911,778,367,779,369,780,468,783,533,785,535,795,432,803,7909,804,7795,808,371,813,7799,816,7797,771,7805,803,7807,768,7809,769,7811,770,373,775,7815,776,7813,778,7832,803,7817,775,7819,776,7821,768,7923,769,253,770,375,771,7929,772,563,775,7823,776,255,777,7927,778,7833,803,7925,769,378,770,7825,775,380,780,382,803,7827,817,7829,768,8173,769,901,834,8129,768,7846,769,7844,771,7850,777,7848,772,478,769,506,769,508,772,482,769,7688,768,7872,769,7870,771,7876,777,7874,769,7726,768,7890,769,7888,771,7894,777,7892,769,7756,772,556,776,7758,772,554,769,510,768,475,769,471,772,469,780,473,768,7847,769,7845,771,7851,777,7849,772,479,769,507,769,509,772,483,769,7689,768,7873,769,7871,771,7877,777,7875,769,7727,768,7891,769,7889,771,7895,777,7893,769,7757,772,557,776,7759,772,555,769,511,768,476,769,472,772,470,780,474,768,7856,769,7854,771,7860,777,7858,768,7857,769,7855,771,7861,777,7859,768,7700,769,7702,768,7701,769,7703,768,7760,769,7762,768,7761,769,7763,775,7780,775,7781,775,7782,775,7783,769,7800,769,7801,776,7802,776,7803,775,7835,768,7900,769,7898,771,7904,777,7902,803,7906,768,7901,769,7899,771,7905,777,7903,803,7907,768,7914,769,7912,
 |
22837
|
|
|
|
|
|
|
771,7918,777,7916,803,7920,768,7915,769,7913,771,7919,777,7917,803,7921,780,494,772,492,772,493,772,480,772,481,774,7708,774,7709,772,560,772,561,780,495,768,8122,769,902,772,8121,774,8120,787,7944,788,7945,837,8124,768,8136,769,904,787,7960,788,7961,768,8138,769,905,787,7976,788,7977,837,8140,768,8154,769,906,772,8153,774,8152,776,938,787,7992,788,7993,768,8184,769,908,787,8008,788,8009,788,8172,768,8170,769,910,772,8169,774,8168,776,939,788,8025,768,8186,769,911,787,8040,788,8041,837,8188,837,8116,837,8132,768,8048,769,940,772,8113,774,8112,787,7936,788,7937,834,8118,837,8115,768,8050,769,941,787,7952,788,7953,768,8052,769,942,787,7968,788,7969,834,8134,837,8131,768,8054,769,943,772,8145,774,8144,776,970,787,7984,788,7985,834,8150,768,8056,769,972,787,8000,788,8001,787,8164,788,8165,768,8058,769,973,772,8161,774,8160,776,971,787,8016,788,8017,834,8166,768,8060,769,974,787,8032,788,8033,834,8182,837,8179,768,8146,769,912,834,8151,768,8162,769,944,834,8167,837,8180,769,979,776,980,776,1031,774,1232,776,1234,769,1027,768,1024,774,1238,776,1025,774,1217,776,1244,776,1246,768,1037,772,1250,774,1049,776,1252,769,1036,776,1254,772,1262,774,1038,776,1264,779,1266,776,1268,776,1272,776,1260,774,1233,776,1235,769,1107,768,1104,774,1239,776,1105,774,1218,776,1245,776,1247,768,1117,772,1251,774,1081,776,1253,769,1116,776,1255,772,1263,774,1118,776,1265,779,1267,776,1269,776,1273,776,1261,776,1111,783,1142,783,1143,776,1242,776,1243,776,1258,776,1259,1619,1570,1620,1571,1621,1573,1620,1572,1620,1574,1620,1730,1620,1747,1620,1728,2364,2345,2364,2353,2364,2356,2494,2507,2519,2508,2878,2891,2902,2888,2903,2892,3031,2964,3006,3018,3031,3020,3006,3019,3158,3144,3285,3264,3266,3274,3285,3271,3286,3272,3285,3275,3390,3402,3415,3404,3390,3403,3530,3546,3535,3548,3551,3550,3530,3549,4142,4134,6965,6918,6965,6920,6965,6922,6965,6924,6965,6926,6965,6930,6965,6971,6965,6973,6965,6976,6965,6977,6965,6979,772,7736,772,7737,772,7772,772,7773,775,7784,775,7785,770,7852,774,7862,770,7853,774, 
|
22838
|
|
|
|
|
|
|
7863,770,7878,770,7879,770,7896,770,7897,768,7938,769,7940,834,7942,837,8064,768,7939,769,7941,834,7943,837,8065,837,8066,837,8067,837,8068,837,8069,837,8070,837,8071,768,7946,769,7948,834,7950,837,8072,768,7947,769,7949,834,7951,837,8073,837,8074,837,8075,837,8076,837,8077,837,8078,837,8079,768,7954,769,7956,768,7955,769,7957,768,7962,769,7964,768,7963,769,7965,768,7970,769,7972,834,7974,837,8080,768,7971,769,7973,834,7975,837,8081,837,8082,837,8083,837,8084,837,8085,837,8086,837,8087,768,7978,769,7980,834,7982,837,8088,768,7979,769,7981,834,7983,837,8089,837,8090,837,8091,837,8092,837,8093,837,8094,837,8095,768,7986,769,7988,834,7990,768,7987,769,7989,834,7991,768,7994,769,7996,834,7998,768,7995,769,7997,834,7999,768,8002,769,8004,768,8003,769,8005,768,8010,769,8012,768,8011,769,8013,768,8018,769,8020,834,8022,768,8019,769,8021,834,8023,768,8027,769,8029,834,8031,768,8034,769,8036,834,8038,837,8096,768,8035,769,8037,834,8039,837,8097,837,8098,837,8099,837,8100,837,8101,837,8102,837,8103,768,8042,769,8044,834,8046,837,8104,768,8043,769,8045,834,8047,837,8105,837,8106,837,8107,837,8108,837,8109,837,8110,837,8111,837,8114,837,8130,837,8178,837,8119,768,8141,769,8142,834,8143,837,8135,837,8183,768,8157,769,8158,834,8159,824,8602,824,8603,824,8622,824,8653,824,8655,824,8654,824,8708,824,8713,824,8716,824,8740,824,8742,824,8769,824,8772,824,8775,824,8777,824,8813,824,8802,824,8816,824,8817,824,8820,824,8821,824,8824,824,8825,824,8832,824,8833,824,8928,824,8929,824,8836,824,8837,824,8840,824,8841,824,8930,824,8931,824,8876,824,8877,824,8878,824,8879,824,8938,824,8939,824,8940,824,8941,12441,12436,12441,12364,12441,12366,12441,12368,12441,12370,12441,12372,12441,12374,12441,12376,12441,12378,12441,12380,12441,12382,12441,12384,12441,12386,12441,12389,12441,12391,12441,12393,12441,12400,12442,12401,12441,12403,12442,12404,12441,12406,12442,12407,12441,12409,12442,12410,12441,12412,12442,12413,12441,12446,12441,12532,12441,12460,12441,12462,12441,12464,12441,12466,12441, |
22839
|
|
|
|
|
|
|
12468,12441,12470,12441,12472,12441,12474,12441,12476,12441,12478,12441,12480,12441,12482,12441,12485,12441,12487,12441,12489,12441,12496,12442,12497,12441,12499,12442,12500,12441,12502,12442,12503,12441,12505,12442,12506,12441,12508,12442,12509,12441,12535,12441,12536,12441,12537,12441,12538,12441,12542,69818,69786,69818,69788,69818,69803,69927,69934,69927,69935,70462,70475,70487,70476,70832,70844,70842,70843,70845,70846,71087,71098,71087,71099,71984,71992 |
22840
|
|
|
|
|
|
|
}; |
22841
|
|
|
|
|
|
|
|
22842
|
|
|
|
|
|
|
const uint8_t uninorms::decomposition_index[uninorms::CHARS >> 8] = { |
22843
|
|
|
|
|
|
|
0,1,2,3,4,5,6,7,7,8,9,10,11,12,13,14,15,7,7,7,7,7,7,7,7,7,7,16,7,17,18,19,20,21,22,23,24,7,7,7,7,7,25,7,26,27,28,29,30,31,32,33,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,34,35,7,7,7,36,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,37,38,39,40,41,42,43,7,7,7,7,7,7,7,44,7,7,7,7,7,7,7,7,45,46,7,47,48,49,7,7,7,50,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,51,7,52,53,54,55,56,7,7,7,7,7,7,7,7,57,7,7,7,7,7,7,7,7,7,7,7,7,58,59,7,60,61,62,7,7,7,7,7,7,7,7,63,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,64,65,66,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
22844
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 |
22845
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 |
22846
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 |
22847
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 |
22848
|
|
|
|
|
|
|
}; |
22849
|
|
|
|
|
|
|
const uint16_t uninorms::decomposition_block[][257] = { |
22850
|
|
|
|
|
|
|
{4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,8,8,8,8,8,8,8,9,16,17,20,20,20,20,21,28,28,29,33,37,45,48,48,49,57,61,64,65,77,89,100,100,108,116,124,132,140,148,148,156,164,172,180,188,196,204,212,220,220,228,236,244,252,260,268,268,268,276,284,292,300,308,308,308,316,324,332,340,348,356,356,364,372,380,388,396,404,412,420,428,428,436,444,452,460,468,476,476,476,484,492,500,508,516,516,524}, |
22851
|
|
|
|
|
|
|
{524,532,540,548,556,564,572,580,588,596,604,612,620,628,636,644,652,652,652,660,668,676,684,692,700,708,716,724,732,740,748,756,764,772,780,788,796,804,812,812,812,820,828,836,844,852,860,868,876,884,885,893,900,908,916,924,932,932,940,948,956,964,972,981,989,996,996,996,1004,1012,1020,1028,1036,1045,1052,1052,1052,1060,1068,1076,1084,1092,1100,1100,1100,1108,1116,1124,1132,1140,1148,1156,1164,1172,1180,1188,1196,1204,1212,1220,1228,1236,1244,1244,1244,1252,1260,1268,1276,1284,1292,1300,1308,1316,1324,1332,1340,1348,1356,1364,1372,1380,1388,1396,1404,1412,1420,1429,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1440,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1456,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1465,1477,1489,1501,1509,1517,1525,1533,1541,1548,1556,1564,1572,1580,1588,1596,1604,1612,1624,1636,1648,1660,1672,1684,1696,1708,1708,1720,1732,1744,1756,1764,1772,1772,1772,1780,1788,1796,1804,1812,1820,1832,1844,1852,1860,1869,1877,1885,1892,1900,1908,1908,1908,1916,1924,1936,1948,1956,1964,1972,1980}, |
22852
|
|
|
|
|
|
|
{1980,1988,1996,2004,2012,2020,2028,2036,2044,2052,2060,2068,2076,2084,2092,2100,2108,2116,2124,2132,2140,2148,2156,2164,2172,2180,2188,2196,2204,2204,2204,2212,2220,2220,2220,2220,2220,2220,2220,2228,2236,2244,2252,2264,2276,2288,2300,2308,2316,2328,2340,2348,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2357,2361,2365,2369,2373,2377,2381,2385,2389,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2393,2401,2409,2417,2425,2433,2440,2440,2441,2445,2449,2453,2457,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460}, |
22853
|
|
|
|
|
|
|
{2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2464,2468,2468,2472,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2484,2484,2484,2484,2484,2485,2492,2492,2492,2492,2496,2496,2496,2496,2496,2497,2506,2512,2520,2524,2532,2540,2548,2548,2556,2556,2564,2572,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2592,2600,2608,2616,2624,2632,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2652,2660,2668,2676,2684,2685,2689,2693,2698,2706,2713,2717,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2721,2725,2729,2732,2733,2737,2740,2740,2740,2741,2744,2744,2744,2744,2744,2744,2744}, |
22854
|
|
|
|
|
|
|
{2744,2752,2760,2760,2768,2768,2768,2768,2776,2776,2776,2776,2776,2784,2792,2800,2800,2800,2800,2800,2800,2800,2800,2800,2800,2800,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2824,2832,2832,2840,2840,2840,2840,2848,2848,2848,2848,2848,2856,2864,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2880,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2896,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2912,2920,2928,2936,2936,2936,2944,2952,2952,2952,2960,2968,2976,2984,2992,3000,3000,3000,3008,3016,3024,3032,3040,3048,3048,3048,3056,3064,3072,3080,3088,3096,3104,3112,3120,3128,3136,3144,3144,3144,3152,3160,3160,3160,3160,3160,3160,3160}, |
22855
|
|
|
|
|
|
|
{3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3161,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168}, |
22856
|
|
|
|
|
|
|
{3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3176,3184,3192,3200,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3209,3217,3225,3233,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3248,3248,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264}, |
22857
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
22858
|
|
|
|
|
|
|
{3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3272,3272,3272,3272,3272,3272,3272,3272,3280,3280,3280,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3296,3304,3312,3320,3328,3336,3344,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3360,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3376,3384,3384,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392}, |
22859
|
|
|
|
|
|
|
{3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3400,3400,3400,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3416,3424,3432,3432,3432,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440}, |
22860
|
|
|
|
|
|
|
{3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3448,3448,3448,3456,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3472,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3496,3504,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512}, |
22861
|
|
|
|
|
|
|
{3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3528,3528,3528,3528,3528,3528,3528,3536,3544,3544,3552,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564}, |
22862
|
|
|
|
|
|
|
{3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3572,3580,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3596,3596,3604,3616,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624}, |
22863
|
|
|
|
|
|
|
{3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3625,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3633,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3641,3649,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656}, |
22864
|
|
|
|
|
|
|
{3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3657,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3668,3668,3668,3668,3668,3668,3668,3668,3668,3668,3676,3676,3676,3676,3676,3684,3684,3684,3684,3684,3692,3692,3692,3692,3692,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3708,3708,3708,3708,3708,3708,3708,3708,3708,3708,3716,3716,3724,3733,3744,3753,3764,3764,3764,3764,3764,3764,3764,3764,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3780,3780,3780,3780,3780,3780,3780,3780,3780,3780,3788,3788,3788,3788,3788,3796,3796,3796,3796,3796,3804,3804,3804,3804,3804,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820}, |
22865
|
|
|
|
|
|
|
{3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3829,3832,3832,3832,3832}, |
22866
|
|
|
|
|
|
|
{3832,3832,3832,3832,3832,3832,3832,3840,3840,3848,3848,3856,3856,3864,3864,3872,3872,3872,3872,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3888,3888,3896,3896,3896,3904,3912,3912,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920}, |
22867
|
|
|
|
|
|
|
{3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3921,3925,3929,3932,3933,3937,3941,3945,3949,3953,3957,3961,3965,3969,3973,3976,3977,3981,3985,3989,3993,3997,4001,4005,4009,4013,4017,4021,4025,4029,4033,4037,4041,4045,4048,4049,4053,4057,4061,4065,4069,4073,4077,4081,4085,4089,4093,4097,4101,4105,4109,4113,4117,4121,4125,4129,4133,4137,4141,4145,4149,4153,4157,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4161,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4165,4169,4173,4177,4181,4185,4189,4193,4197,4201,4205,4209,4213,4217,4221,4225,4229,4233,4237,4241,4245,4249,4253,4257,4261,4265,4269,4273,4277,4281,4285,4289,4293,4297,4301,4305,4309,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312}, |
22868
|
|
|
|
|
|
|
{4312,4320,4328,4336,4344,4352,4360,4368,4376,4388,4400,4408,4416,4424,4432,4440,4448,4456,4464,4472,4480,4492,4504,4516,4528,4536,4544,4552,4560,4572,4584,4592,4600,4608,4616,4624,4632,4640,4648,4656,4664,4672,4680,4688,4696,4704,4712,4724,4736,4744,4752,4760,4768,4776,4784,4792,4800,4812,4824,4832,4840,4848,4856,4864,4872,4880,4888,4896,4904,4912,4920,4928,4936,4944,4952,4960,4968,4980,4992,5004,5016,5028,5040,5052,5064,5072,5080,5088,5096,5104,5112,5120,5128,5140,5152,5160,5168,5176,5184,5192,5200,5212,5224,5236,5248,5260,5272,5280,5288,5296,5304,5312,5320,5328,5336,5344,5352,5360,5368,5376,5384,5396,5408,5420,5432,5440,5448,5456,5464,5472,5480,5488,5496,5504,5512,5520,5528,5536,5544,5552,5560,5568,5576,5584,5592,5600,5608,5616,5624,5632,5640,5648,5656,5664,5673,5682,5688,5688,5688,5688,5688,5696,5704,5712,5720,5732,5744,5756,5768,5780,5792,5804,5816,5828,5840,5852,5864,5876,5888,5900,5912,5924,5936,5948,5960,5968,5976,5984,5992,6000,6008,6020,6032,6044,6056,6068,6080,6092,6104,6116,6128,6136,6144,6152,6160,6168,6176,6184,6192,6204,6216,6228,6240,6252,6264,6276,6288,6300,6312,6324,6336,6348,6360,6372,6384,6396,6408,6420,6432,6440,6448,6456,6464,6476,6488,6500,6512,6524,6536,6548,6560,6572,6584,6592,6600,6608,6616,6624,6632,6640,6648,6648,6648,6648,6648,6648,6648}, |
22869
|
|
|
|
|
|
|
{6648,6656,6664,6676,6688,6700,6712,6724,6736,6744,6752,6764,6776,6788,6800,6812,6824,6832,6840,6852,6864,6876,6888,6888,6888,6896,6904,6916,6928,6940,6952,6952,6952,6960,6968,6980,6992,7004,7016,7028,7040,7048,7056,7068,7080,7092,7104,7116,7128,7136,7144,7156,7168,7180,7192,7204,7216,7224,7232,7244,7256,7268,7280,7292,7304,7312,7320,7332,7344,7356,7368,7368,7368,7376,7384,7396,7408,7420,7432,7432,7432,7440,7448,7460,7472,7484,7496,7508,7520,7520,7528,7528,7540,7540,7552,7552,7564,7572,7580,7592,7604,7616,7628,7640,7652,7660,7668,7680,7692,7704,7716,7728,7740,7748,7756,7764,7772,7780,7788,7796,7804,7812,7820,7828,7836,7844,7852,7852,7852,7864,7876,7892,7908,7924,7940,7956,7972,7984,7996,8012,8028,8044,8060,8076,8092,8104,8116,8132,8148,8164,8180,8196,8212,8224,8236,8252,8268,8284,8300,8316,8332,8344,8356,8372,8388,8404,8420,8436,8452,8464,8476,8492,8508,8524,8540,8556,8572,8580,8588,8600,8608,8620,8620,8628,8640,8648,8656,8664,8672,8681,8688,8693,8701,8710,8716,8728,8736,8748,8748,8756,8768,8776,8784,8792,8800,8810,8818,8826,8832,8840,8848,8860,8872,8872,8872,8880,8892,8900,8908,8916,8924,8926,8934,8942,8948,8956,8964,8976,8988,8996,9004,9012,9024,9032,9040,9048,9056,9066,9074,9080,9084,9084,9084,9096,9104,9116,9116,9124,9136,9144,9152,9160,9168,9178,9181,9188,9190}, |
22870
|
|
|
|
|
|
|
{9190,9194,9197,9201,9205,9209,9213,9217,9221,9225,9229,9232,9232,9232,9232,9232,9232,9233,9236,9236,9236,9236,9236,9237,9244,9244,9244,9244,9244,9244,9244,9244,9244,9244,9244,9244,9245,9249,9257,9268,9268,9268,9268,9268,9268,9268,9268,9269,9272,9272,9272,9273,9281,9292,9293,9301,9312,9312,9312,9312,9313,9320,9321,9328,9328,9328,9328,9328,9328,9328,9328,9329,9337,9345,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9353,9368,9368,9368,9368,9368,9368,9368,9369,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9373,9377,9380,9380,9381,9385,9389,9393,9397,9401,9405,9409,9413,9417,9421,9425,9429,9433,9437,9441,9445,9449,9453,9457,9461,9465,9469,9473,9477,9481,9485,9488,9489,9493,9497,9501,9505,9509,9513,9517,9521,9525,9529,9533,9537,9540,9540,9540,9540,9540,9540,9540,9540,9540,9540,9540,9541,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9549}, |
22871
|
|
|
|
|
|
|
{9549,9561,9573,9577,9584,9585,9597,9609,9612,9613,9621,9625,9629,9633,9637,9641,9645,9649,9653,9657,9660,9661,9665,9672,9672,9673,9677,9681,9685,9689,9692,9692,9693,9701,9713,9720,9721,9724,9724,9728,9729,9732,9732,9736,9745,9749,9752,9753,9757,9761,9764,9765,9769,9773,9777,9781,9785,9789,9792,9793,9805,9809,9813,9817,9821,9824,9824,9824,9824,9825,9829,9833,9837,9841,9844,9844,9844,9844,9844,9844,9845,9857,9869,9885,9897,9909,9921,9933,9945,9957,9969,9981,9993,10005,10017,10029,10037,10041,10049,10061,10069,10073,10081,10093,10109,10117,10121,10129,10141,10145,10149,10153,10157,10161,10169,10181,10189,10193,10201,10213,10229,10237,10241,10249,10261,10265,10269,10273,10276,10276,10276,10276,10276,10276,10276,10276,10276,10277,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10296,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10320,10328,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336}, |
22872
|
|
|
|
|
|
|
{10336,10336,10336,10336,10336,10344,10344,10344,10344,10344,10352,10352,10352,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10368,10368,10376,10376,10376,10376,10376,10377,10385,10396,10397,10405,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10424,10424,10424,10432,10432,10432,10440,10440,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10456,10456,10464,10464,10464,10464,10464,10464,10464,10464,10464,10464,10464,10472,10480,10488,10496,10504,10504,10504,10512,10520,10520,10520,10528,10536,10536,10536,10536,10536,10536,10536,10544,10552,10552,10552,10560,10568,10568,10568,10576,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10592,10600,10608,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10624,10632,10640,10648,10648,10648,10648,10648,10648,10648,10656,10664,10672,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680}, |
22873
|
|
|
|
|
|
|
{10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10684,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688}, |
22874
|
|
|
|
|
|
|
{10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10689,10693,10697,10701,10705,10709,10713,10717,10721,10725,10733,10741,10749,10757,10765,10773,10781,10789,10797,10805,10813,10825,10837,10849,10861,10873,10885,10897,10909,10921,10937,10953,10969,10985,11001,11017,11033,11049,11065,11081,11097,11105,11113,11121,11129,11137,11145,11153,11161,11169,11181,11193,11205,11217,11229,11241,11253,11265,11277,11289,11301,11313,11325,11337,11349,11361,11373,11385,11397,11409,11421,11433,11445,11457,11469,11481,11493,11505,11517,11529,11541,11553,11565,11577,11589,11601,11613,11617,11621,11625,11629,11633,11637,11641,11645,11649,11653,11657,11661,11665,11669,11673,11677,11681,11685,11689,11693,11697,11701,11705,11709,11713,11717,11721,11725,11729,11733,11737,11741,11745,11749,11753,11757,11761,11765,11769,11773,11777,11781,11785,11789,11793,11797,11801,11805,11809,11813,11817,11821,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824}, |
22875
|
|
|
|
|
|
|
{11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11825,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11841,11853,11861,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880}, |
22876
|
|
|
|
|
|
|
{11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11881,11885,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888}, |
22877
|
|
|
|
|
|
|
{11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11889,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892}, |
22878
|
|
|
|
|
|
|
{11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11893,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11897,11900,11900,11900,11900,11900,11900,11900,11900,11900,11900,11900,11900,11901}, |
22879
|
|
|
|
|
|
|
{11901,11905,11909,11913,11917,11921,11925,11929,11933,11937,11941,11945,11949,11953,11957,11961,11965,11969,11973,11977,11981,11985,11989,11993,11997,12001,12005,12009,12013,12017,12021,12025,12029,12033,12037,12041,12045,12049,12053,12057,12061,12065,12069,12073,12077,12081,12085,12089,12093,12097,12101,12105,12109,12113,12117,12121,12125,12129,12133,12137,12141,12145,12149,12153,12157,12161,12165,12169,12173,12177,12181,12185,12189,12193,12197,12201,12205,12209,12213,12217,12221,12225,12229,12233,12237,12241,12245,12249,12253,12257,12261,12265,12269,12273,12277,12281,12285,12289,12293,12297,12301,12305,12309,12313,12317,12321,12325,12329,12333,12337,12341,12345,12349,12353,12357,12361,12365,12369,12373,12377,12381,12385,12389,12393,12397,12401,12405,12409,12413,12417,12421,12425,12429,12433,12437,12441,12445,12449,12453,12457,12461,12465,12469,12473,12477,12481,12485,12489,12493,12497,12501,12505,12509,12513,12517,12521,12525,12529,12533,12537,12541,12545,12549,12553,12557,12561,12565,12569,12573,12577,12581,12585,12589,12593,12597,12601,12605,12609,12613,12617,12621,12625,12629,12633,12637,12641,12645,12649,12653,12657,12661,12665,12669,12673,12677,12681,12685,12689,12693,12697,12701,12705,12709,12713,12717,12721,12725,12729,12733,12737,12741,12745,12749,12753,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12757}, |
22880
|
|
|
|
|
|
|
{12757,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12761,12764,12765,12769,12773,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12784,12784,12792,12792,12800,12800,12808,12808,12816,12816,12824,12824,12832,12832,12840,12840,12848,12848,12856,12856,12864,12864,12872,12872,12872,12880,12880,12888,12888,12896,12896,12896,12896,12896,12896,12896,12904,12912,12912,12920,12928,12928,12936,12944,12944,12952,12960,12960,12968,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12984,12984,12984,12984,12984,12984,12985,12993,13000,13000,13009,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13024,13024,13032,13032,13040,13040,13048,13048,13056,13056,13064,13064,13072,13072,13080,13080,13088,13088,13096,13096,13104,13104,13112,13112,13112,13120,13120,13128,13128,13136,13136,13136,13136,13136,13136,13136,13144,13152,13152,13160,13168,13168,13176,13184,13184,13192,13200,13200,13208,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13224,13224,13224,13232,13240,13248,13256,13256,13256,13256,13265,13272}, |
22881
|
|
|
|
|
|
|
{13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13273,13277,13281,13285,13289,13293,13297,13301,13305,13309,13313,13317,13321,13325,13329,13333,13337,13341,13345,13349,13353,13357,13361,13365,13369,13373,13377,13381,13385,13389,13393,13397,13401,13405,13409,13413,13417,13421,13425,13429,13433,13437,13441,13445,13449,13453,13457,13461,13465,13469,13473,13477,13481,13485,13489,13493,13497,13501,13505,13509,13513,13517,13521,13525,13529,13533,13537,13541,13545,13549,13553,13557,13561,13565,13569,13573,13577,13581,13585,13589,13593,13597,13601,13605,13609,13613,13617,13621,13625,13629,13633,13637,13641,13645,13648,13648,13648,13649,13653,13657,13661,13665,13669,13673,13677,13681,13685,13689,13693,13697,13701,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13705}, |
22882
|
|
|
|
|
|
|
{13705,13717,13729,13741,13753,13765,13777,13789,13801,13813,13825,13837,13849,13861,13873,13889,13905,13921,13937,13953,13969,13985,14001,14017,14033,14049,14065,14081,14097,14113,14141,14164,14165,14177,14189,14201,14213,14225,14237,14249,14261,14273,14285,14297,14309,14321,14333,14345,14357,14369,14381,14393,14405,14417,14429,14441,14453,14465,14477,14489,14501,14513,14525,14537,14549,14561,14573,14585,14597,14601,14605,14609,14612,14612,14612,14612,14612,14612,14612,14612,14613,14625,14633,14641,14649,14657,14665,14673,14681,14689,14697,14705,14713,14721,14729,14737,14745,14749,14753,14757,14761,14765,14769,14773,14777,14781,14785,14789,14793,14797,14801,14809,14817,14825,14833,14841,14849,14857,14865,14873,14881,14889,14897,14905,14913,14933,14949,14956,14957,14961,14965,14969,14973,14977,14981,14985,14989,14993,14997,15001,15005,15009,15013,15017,15021,15025,15029,15033,15037,15041,15045,15049,15053,15057,15061,15065,15069,15073,15077,15081,15085,15089,15093,15097,15101,15105,15109,15113,15117,15121,15125,15129,15133,15137,15141,15145,15149,15153,15161,15169,15177,15185,15193,15201,15209,15217,15225,15233,15241,15249,15257,15265,15273,15281,15289,15297,15305,15313,15321,15329,15337,15345,15357,15369,15381,15389,15401,15409,15421,15425,15429,15433,15437,15441,15445,15449,15453,15457,15461,15465,15469,15473,15477,15481,15485,15489,15493,15497,15501,15505,15509,15513,15517,15521,15525,15529,15533,15537,15541,15545,15549,15553,15557,15561,15565,15569,15573,15577,15581,15585,15589,15593,15597,15601,15605,15609,15617}, |
22883
|
|
|
|
|
|
|
{15617,15637,15653,15673,15685,15705,15717,15729,15753,15769,15781,15793,15805,15821,15837,15853,15869,15885,15901,15917,15941,15949,15973,15997,16017,16033,16057,16081,16097,16109,16121,16137,16153,16173,16193,16205,16217,16233,16245,16257,16265,16273,16285,16297,16321,16337,16357,16381,16397,16409,16421,16445,16461,16485,16497,16517,16529,16545,16557,16573,16593,16609,16629,16645,16653,16673,16685,16697,16713,16725,16737,16749,16769,16785,16793,16817,16829,16849,16865,16881,16893,16905,16921,16929,16945,16965,16973,16997,17009,17017,17025,17033,17041,17049,17057,17065,17073,17081,17089,17101,17113,17125,17137,17149,17161,17173,17185,17197,17209,17221,17233,17245,17257,17269,17281,17289,17297,17309,17317,17325,17333,17345,17357,17365,17373,17381,17389,17397,17413,17421,17429,17437,17445,17453,17461,17469,17477,17489,17505,17513,17521,17529,17537,17545,17553,17561,17573,17585,17597,17609,17617,17625,17633,17641,17649,17657,17665,17673,17681,17689,17701,17713,17721,17733,17745,17757,17765,17777,17789,17805,17813,17825,17837,17849,17861,17881,17905,17913,17921,17929,17937,17945,17953,17961,17969,17977,17985,17993,18001,18009,18017,18025,18033,18041,18049,18065,18073,18081,18089,18105,18117,18125,18133,18141,18149,18157,18165,18173,18181,18189,18197,18209,18217,18225,18237,18249,18257,18273,18285,18293,18301,18309,18317,18329,18341,18349,18357,18365,18373,18381,18389,18397,18405,18413,18425,18437,18449,18461,18473,18485,18497,18509,18521,18533,18545,18557,18569,18581,18593,18605,18617,18629,18641,18653,18665,18677,18688}, |
22884
|
|
|
|
|
|
|
{18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18689,18693,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696}, |
22885
|
|
|
|
|
|
|
{18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18697,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18701,18705,18709,18712,18712,18712,18713,18717,18720,18720,18720,18720,18720,18720,18720}, |
22886
|
|
|
|
|
|
|
{18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18721,18725,18729,18733,18736,18736,18736,18736,18736,18736,18736,18736,18736,18737,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740}, |
22887
|
|
|
|
|
|
|
{18740,18744,18748,18752,18756,18760,18764,18768,18772,18776,18780,18784,18788,18792,18796,18800,18804,18808,18812,18816,18820,18824,18828,18832,18836,18840,18844,18848,18852,18856,18860,18864,18868,18872,18876,18880,18884,18888,18892,18896,18900,18904,18908,18912,18916,18920,18924,18928,18932,18936,18940,18944,18948,18952,18956,18960,18964,18968,18972,18976,18980,18984,18988,18992,18996,19000,19004,19008,19012,19016,19020,19024,19028,19032,19036,19040,19044,19048,19052,19056,19060,19064,19068,19072,19076,19080,19084,19088,19092,19096,19100,19104,19108,19112,19116,19120,19124,19128,19132,19136,19140,19144,19148,19152,19156,19160,19164,19168,19172,19176,19180,19184,19188,19192,19196,19200,19204,19208,19212,19216,19220,19224,19228,19232,19236,19240,19244,19248,19252,19256,19260,19264,19268,19272,19276,19280,19284,19288,19292,19296,19300,19304,19308,19312,19316,19320,19324,19328,19332,19336,19340,19344,19348,19352,19356,19360,19364,19368,19372,19376,19380,19384,19388,19392,19396,19400,19404,19408,19412,19416,19420,19424,19428,19432,19436,19440,19444,19448,19452,19456,19460,19464,19468,19472,19476,19480,19484,19488,19492,19496,19500,19504,19508,19512,19516,19520,19524,19528,19532,19536,19540,19544,19548,19552,19556,19560,19564,19568,19572,19576,19580,19584,19588,19592,19596,19600,19604,19608,19612,19616,19620,19624,19628,19632,19636,19640,19644,19648,19652,19656,19660,19664,19668,19672,19676,19680,19684,19688,19692,19696,19700,19704,19708,19712,19716,19720,19724,19728,19732,19736,19740,19744,19748,19752,19756,19760,19764}, |
22888
|
|
|
|
|
|
|
{19764,19768,19772,19776,19780,19784,19788,19792,19796,19800,19804,19808,19812,19816,19820,19820,19820,19824,19824,19828,19828,19828,19832,19836,19840,19844,19848,19852,19856,19860,19864,19868,19868,19872,19872,19876,19876,19876,19880,19884,19884,19884,19884,19888,19892,19896,19900,19904,19908,19912,19916,19920,19924,19928,19932,19936,19940,19944,19948,19952,19956,19960,19964,19968,19972,19976,19980,19984,19988,19992,19996,20000,20004,20008,20012,20016,20020,20024,20028,20032,20036,20040,20044,20048,20052,20056,20060,20064,20068,20072,20076,20080,20084,20088,20092,20096,20100,20104,20108,20112,20116,20120,20124,20128,20132,20136,20140,20144,20148,20152,20156,20156,20156,20160,20164,20168,20172,20176,20180,20184,20188,20192,20196,20200,20204,20208,20212,20216,20220,20224,20228,20232,20236,20240,20244,20248,20252,20256,20260,20264,20268,20272,20276,20280,20284,20288,20292,20296,20300,20304,20308,20312,20316,20320,20324,20328,20332,20336,20340,20344,20348,20352,20356,20360,20364,20368,20372,20376,20380,20384,20388,20392,20396,20400,20404,20408,20412,20416,20420,20424,20428,20432,20436,20440,20444,20448,20452,20456,20460,20464,20468,20472,20476,20480,20484,20488,20492,20496,20500,20504,20508,20512,20516,20520,20524,20528,20532,20536,20540,20544,20548,20552,20556,20560,20564,20568,20572,20576,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20581}, |
22889
|
|
|
|
|
|
|
{20581,20589,20597,20605,20617,20629,20637,20644,20644,20644,20644,20644,20644,20644,20644,20644,20644,20644,20644,20645,20653,20661,20669,20677,20684,20684,20684,20684,20684,20684,20692,20692,20701,20705,20709,20713,20717,20721,20725,20729,20733,20737,20740,20748,20756,20768,20780,20788,20796,20804,20812,20820,20828,20836,20844,20852,20852,20860,20868,20876,20884,20892,20892,20900,20900,20908,20916,20916,20924,20932,20932,20940,20948,20956,20964,20972,20980,20988,20996,21005,21013,21017,21021,21025,21029,21033,21037,21041,21045,21049,21053,21057,21061,21065,21069,21073,21077,21081,21085,21089,21093,21097,21101,21105,21109,21113,21117,21121,21125,21129,21133,21137,21141,21145,21149,21153,21157,21161,21165,21169,21173,21177,21181,21185,21189,21193,21197,21201,21205,21209,21213,21217,21221,21225,21229,21233,21237,21241,21245,21249,21253,21257,21261,21265,21269,21273,21277,21281,21285,21289,21293,21297,21301,21305,21309,21313,21317,21321,21325,21329,21333,21337,21341,21345,21349,21357,21365,21369,21373,21377,21381,21385,21389,21393,21397,21401,21405,21413,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21421,21425,21429,21433,21437,21441,21445,21449,21453,21457,21461,21469,21473,21477,21481,21485,21489,21493,21497,21501,21505,21509,21513,21517,21529,21541,21553,21565,21577,21589,21601,21613,21625,21637,21649,21661,21673,21685,21697,21709,21721,21733,21737,21741,21745,21749}, |
22890
|
|
|
|
|
|
|
{21749,21761,21773,21785,21797,21809,21817,21825,21833,21841,21849,21857,21865,21873,21881,21889,21897,21905,21913,21921,21929,21937,21945,21953,21961,21969,21977,21985,21993,22001,22009,22017,22025,22033,22041,22049,22057,22065,22073,22081,22089,22097,22105,22113,22121,22129,22137,22145,22153,22161,22169,22177,22185,22193,22201,22209,22217,22225,22233,22241,22249,22257,22265,22273,22281,22289,22297,22305,22313,22321,22329,22337,22345,22353,22361,22369,22377,22385,22393,22401,22409,22417,22425,22433,22441,22449,22457,22465,22473,22481,22489,22497,22505,22513,22521,22533,22545,22557,22569,22581,22593,22605,22617,22629,22641,22653,22665,22673,22681,22689,22697,22705,22713,22721,22729,22737,22745,22753,22761,22769,22777,22785,22793,22801,22809,22817,22825,22833,22841,22849,22857,22865,22873,22881,22889,22897,22905,22913,22921,22929,22937,22945,22953,22961,22969,22977,22985,22993,23001,23009,23017,23025,23037,23049,23061,23073,23085,23093,23101,23109,23117,23125,23133,23141,23149,23157,23165,23173,23181,23189,23197,23205,23213,23221,23229,23237,23245,23253,23261,23269,23277,23285,23293,23301,23309,23317,23325,23333,23341,23349,23357,23365,23373,23381,23389,23397,23405,23413,23421,23429,23437,23445,23453,23461,23469,23477,23485,23493,23501,23509,23517,23525,23533,23541,23549,23557,23565,23573,23581,23589,23597,23605,23613,23621,23633,23645,23653,23661,23669,23677,23685,23693,23701,23709,23717,23725,23733,23741,23749,23757,23765,23773,23781,23793,23805,23817,23825,23833,23841,23849,23857,23865,23873,23881,23889,23897,23905}, |
22891
|
|
|
|
|
|
|
{23905,23913,23921,23929,23937,23945,23953,23961,23969,23977,23985,23993,24001,24009,24017,24025,24033,24041,24049,24057,24065,24073,24081,24089,24097,24105,24113,24121,24129,24137,24145,24153,24161,24169,24177,24185,24193,24201,24209,24217,24225,24233,24241,24249,24257,24265,24273,24281,24289,24297,24305,24313,24321,24329,24337,24345,24353,24361,24369,24377,24385,24393,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24401,24413,24425,24437,24449,24461,24473,24485,24497,24509,24521,24533,24545,24557,24569,24581,24593,24605,24617,24629,24641,24653,24665,24677,24689,24701,24713,24725,24737,24749,24761,24773,24785,24797,24809,24821,24833,24845,24857,24869,24881,24893,24905,24917,24929,24941,24953,24965,24977,24989,25001,25013,25025,25037,25049,25061,25073,25085,25097,25109,25121,25133,25145,25157,25168,25168,25169,25181,25193,25205,25217,25229,25241,25253,25265,25277,25289,25301,25313,25325,25337,25349,25361,25373,25385,25397,25409,25421,25433,25445,25457,25469,25481,25493,25505,25517,25529,25541,25553,25565,25577,25589,25601,25613,25625,25637,25649,25661,25673,25685,25697,25709,25721,25733,25745,25757,25769,25781,25793,25805,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25817,25829,25841,25857,25873,25889,25905,25921,25937,25953,25965,26037,26069,26084,26084,26084,26084}, |
22892
|
|
|
|
|
|
|
{26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26085,26089,26093,26097,26101,26105,26109,26113,26117,26121,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26133,26141,26145,26149,26153,26157,26161,26165,26169,26173,26177,26181,26185,26189,26193,26197,26201,26205,26209,26213,26217,26220,26220,26221,26225,26229,26237,26245,26253,26261,26265,26269,26273,26277,26281,26284,26285,26289,26293,26297,26301,26305,26309,26313,26317,26321,26325,26329,26333,26337,26341,26345,26349,26353,26357,26360,26361,26365,26369,26373,26376,26376,26376,26376,26377,26385,26393,26400,26401,26408,26409,26417,26425,26433,26441,26449,26457,26465,26473,26481,26489,26493,26501,26509,26517,26525,26533,26541,26549,26557,26565,26573,26581,26589,26593,26597,26601,26605,26609,26613,26617,26621,26625,26629,26633,26637,26641,26645,26649,26653,26657,26661,26665,26669,26673,26677,26681,26685,26689,26693,26697,26701,26705,26709,26713,26717,26721,26725,26729,26733,26737,26741,26745,26749,26753,26757,26761,26765,26769,26773,26777,26781,26785,26789,26793,26797,26801,26805,26809,26813,26817,26821,26825,26829,26833,26837,26841,26845,26849,26853,26857,26861,26865,26869,26873,26877,26881,26885,26889,26893,26897,26901,26905,26909,26913,26917,26921,26925,26929,26933,26937,26941,26945,26949,26953,26957,26961,26965,26969,26973,26977,26981,26985,26989,26993,26997,27001,27005,27017,27029,27041,27053,27065,27077,27085,27092,27092,27092,27092}, |
22893
|
|
|
|
|
|
|
{27092,27093,27097,27101,27105,27109,27113,27117,27121,27125,27129,27133,27137,27141,27145,27149,27153,27157,27161,27165,27169,27173,27177,27181,27185,27189,27193,27197,27201,27205,27209,27213,27217,27221,27225,27229,27233,27237,27241,27245,27249,27253,27257,27261,27265,27269,27273,27277,27281,27285,27289,27293,27297,27301,27305,27309,27313,27317,27321,27325,27329,27333,27337,27341,27345,27349,27353,27357,27361,27365,27369,27373,27377,27381,27385,27389,27393,27397,27401,27405,27409,27413,27417,27421,27425,27429,27433,27437,27441,27445,27449,27453,27457,27461,27465,27469,27473,27477,27481,27485,27489,27493,27497,27501,27505,27509,27513,27517,27521,27525,27529,27533,27537,27541,27545,27549,27553,27557,27561,27565,27569,27573,27577,27581,27585,27589,27593,27597,27601,27605,27609,27613,27617,27621,27625,27629,27633,27637,27641,27645,27649,27653,27657,27661,27665,27669,27673,27677,27681,27685,27689,27693,27697,27701,27705,27709,27713,27717,27721,27725,27729,27733,27737,27741,27745,27749,27753,27757,27761,27765,27769,27773,27777,27781,27785,27789,27793,27797,27801,27805,27809,27813,27817,27821,27825,27829,27833,27837,27841,27845,27849,27852,27852,27852,27853,27857,27861,27865,27869,27873,27876,27876,27877,27881,27885,27889,27893,27897,27900,27900,27901,27905,27909,27913,27917,27921,27924,27924,27925,27929,27933,27936,27936,27936,27937,27941,27945,27949,27957,27961,27965,27968,27969,27973,27977,27981,27985,27989,27993,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996}, |
22894
|
|
|
|
|
|
|
{27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27997,28001,28005,28009,28013,28016,28017,28021,28025,28029,28033,28037,28041,28045,28049,28053,28057,28061,28065,28069,28073,28077,28081,28085,28089,28093,28097,28101,28105,28109,28113,28117,28121,28125,28129,28133,28137,28141,28145,28149,28153,28157,28161,28165,28169,28173,28177,28181,28184,28185,28189,28193,28197,28201,28205,28209,28213,28217,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220}, |
22895
|
|
|
|
|
|
|
{28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28228,28228,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244}, |
22896
|
|
|
|
|
|
|
{28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28252,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260}, |
22897
|
|
|
|
|
|
|
{28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28268,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276}, |
22898
|
|
|
|
|
|
|
{28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28284,28292,28292,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300}, |
22899
|
|
|
|
|
|
|
{28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28308,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316}, |
22900
|
|
|
|
|
|
|
{28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324}, |
22901
|
|
|
|
|
|
|
{28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28332,28340,28352,28364,28376,28388,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28408,28416,28428,28440,28452,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464}, |
22902
|
|
|
|
|
|
|
{28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28465}, |
22903
|
|
|
|
|
|
|
{28465,28469,28473,28477,28481,28485,28489,28493,28497,28501,28505,28509,28513,28517,28521,28525,28529,28533,28537,28541,28545,28549,28553,28557,28561,28565,28569,28573,28577,28581,28585,28589,28593,28597,28601,28605,28609,28613,28617,28621,28625,28629,28633,28637,28641,28645,28649,28653,28657,28661,28665,28669,28673,28677,28681,28685,28689,28693,28697,28701,28705,28709,28713,28717,28721,28725,28729,28733,28737,28741,28745,28749,28753,28757,28761,28765,28769,28773,28777,28781,28785,28789,28793,28797,28801,28804,28805,28809,28813,28817,28821,28825,28829,28833,28837,28841,28845,28849,28853,28857,28861,28865,28869,28873,28877,28881,28885,28889,28893,28897,28901,28905,28909,28913,28917,28921,28925,28929,28933,28937,28941,28945,28949,28953,28957,28961,28965,28969,28973,28977,28981,28985,28989,28993,28997,29001,29005,29009,29013,29017,29021,29025,29029,29033,29037,29041,29045,29049,29053,29057,29061,29065,29069,29073,29077,29081,29085,29088,29089,29093,29096,29096,29097,29100,29100,29101,29105,29108,29108,29109,29113,29117,29121,29124,29125,29129,29133,29137,29141,29145,29149,29153,29157,29161,29165,29169,29172,29173,29176,29177,29181,29185,29189,29193,29197,29201,29204,29205,29209,29213,29217,29221,29225,29229,29233,29237,29241,29245,29249,29253,29257,29261,29265,29269,29273,29277,29281,29285,29289,29293,29297,29301,29305,29309,29313,29317,29321,29325,29329,29333,29337,29341,29345,29349,29353,29357,29361,29365,29369,29373,29377,29381,29385,29389,29393,29397,29401,29405,29409,29413,29417,29421,29425,29429,29433,29437,29441}, |
22904
|
|
|
|
|
|
|
{29441,29445,29449,29453,29457,29461,29464,29465,29469,29473,29477,29480,29480,29481,29485,29489,29493,29497,29501,29505,29509,29512,29513,29517,29521,29525,29529,29533,29537,29540,29541,29545,29549,29553,29557,29561,29565,29569,29573,29577,29581,29585,29589,29593,29597,29601,29605,29609,29613,29617,29621,29625,29629,29633,29637,29641,29645,29649,29652,29653,29657,29661,29665,29668,29669,29673,29677,29681,29685,29688,29689,29692,29692,29692,29693,29697,29701,29705,29709,29713,29717,29720,29721,29725,29729,29733,29737,29741,29745,29749,29753,29757,29761,29765,29769,29773,29777,29781,29785,29789,29793,29797,29801,29805,29809,29813,29817,29821,29825,29829,29833,29837,29841,29845,29849,29853,29857,29861,29865,29869,29873,29877,29881,29885,29889,29893,29897,29901,29905,29909,29913,29917,29921,29925,29929,29933,29937,29941,29945,29949,29953,29957,29961,29965,29969,29973,29977,29981,29985,29989,29993,29997,30001,30005,30009,30013,30017,30021,30025,30029,30033,30037,30041,30045,30049,30053,30057,30061,30065,30069,30073,30077,30081,30085,30089,30093,30097,30101,30105,30109,30113,30117,30121,30125,30129,30133,30137,30141,30145,30149,30153,30157,30161,30165,30169,30173,30177,30181,30185,30189,30193,30197,30201,30205,30209,30213,30217,30221,30225,30229,30233,30237,30241,30245,30249,30253,30257,30261,30265,30269,30273,30277,30281,30285,30289,30293,30297,30301,30305,30309,30313,30317,30321,30325,30329,30333,30337,30341,30345,30349,30353,30357,30361,30365,30369,30373,30377,30381,30385,30389,30393,30397,30401,30405,30409,30413,30417}, |
22905
|
|
|
|
|
|
|
{30417,30421,30425,30429,30433,30437,30441,30445,30449,30453,30457,30461,30465,30469,30473,30477,30481,30485,30489,30493,30497,30501,30505,30509,30513,30517,30521,30525,30529,30533,30537,30541,30545,30549,30553,30557,30561,30565,30569,30573,30577,30581,30585,30589,30593,30597,30601,30605,30609,30613,30617,30621,30625,30629,30633,30637,30641,30645,30649,30653,30657,30661,30665,30669,30673,30677,30681,30685,30689,30693,30697,30701,30705,30709,30713,30717,30721,30725,30729,30733,30737,30741,30745,30749,30753,30757,30761,30765,30769,30773,30777,30781,30785,30789,30793,30797,30801,30805,30809,30813,30817,30821,30825,30829,30833,30837,30841,30845,30849,30853,30857,30861,30865,30869,30873,30877,30881,30885,30889,30893,30897,30901,30905,30909,30913,30917,30921,30925,30929,30933,30937,30941,30945,30949,30953,30957,30961,30965,30969,30973,30977,30981,30985,30989,30993,30997,31001,31005,31009,31013,31017,31021,31025,31029,31033,31037,31041,31045,31049,31053,31057,31061,31065,31069,31073,31077,31080,31080,31081,31085,31089,31093,31097,31101,31105,31109,31113,31117,31121,31125,31129,31133,31137,31141,31145,31149,31153,31157,31161,31165,31169,31173,31177,31181,31185,31189,31193,31197,31201,31205,31209,31213,31217,31221,31225,31229,31233,31237,31241,31245,31249,31253,31257,31261,31265,31269,31273,31277,31281,31285,31289,31293,31297,31301,31305,31309,31313,31317,31321,31325,31329,31333,31337,31341,31345,31349,31353,31357,31361,31365,31369,31373,31377,31381,31385,31389,31393,31397,31401,31405,31409,31413,31417,31421,31425,31429,31433}, |
22906
|
|
|
|
|
|
|
{31433,31437,31441,31445,31449,31453,31457,31461,31465,31469,31473,31477,31481,31485,31489,31493,31497,31501,31505,31509,31513,31517,31521,31525,31529,31533,31537,31541,31545,31549,31553,31557,31561,31565,31569,31573,31577,31581,31585,31589,31593,31597,31601,31605,31609,31613,31617,31621,31625,31629,31633,31637,31641,31645,31649,31653,31657,31661,31665,31669,31673,31677,31681,31685,31689,31693,31697,31701,31705,31709,31713,31717,31721,31725,31729,31733,31737,31741,31745,31749,31753,31757,31761,31765,31769,31773,31777,31781,31785,31789,31793,31797,31801,31805,31809,31813,31817,31821,31825,31829,31833,31837,31841,31845,31849,31853,31857,31861,31865,31869,31873,31877,31881,31885,31889,31893,31897,31901,31905,31909,31913,31917,31921,31925,31929,31933,31937,31941,31945,31949,31953,31957,31961,31965,31969,31973,31977,31981,31985,31989,31993,31997,32001,32005,32009,32013,32017,32021,32025,32029,32033,32037,32041,32045,32049,32053,32057,32061,32065,32069,32073,32077,32081,32085,32089,32093,32097,32101,32105,32109,32113,32117,32121,32125,32129,32133,32137,32141,32145,32149,32153,32157,32161,32165,32169,32173,32177,32181,32185,32189,32193,32197,32201,32205,32209,32213,32217,32221,32225,32229,32233,32237,32241,32245,32248,32248,32249,32253,32257,32261,32265,32269,32273,32277,32281,32285,32289,32293,32297,32301,32305,32309,32313,32317,32321,32325,32329,32333,32337,32341,32345,32349,32353,32357,32361,32365,32369,32373,32377,32381,32385,32389,32393,32397,32401,32405,32409,32413,32417,32421,32425,32429,32433,32437,32441,32445,32448}, |
22907
|
|
|
|
|
|
|
{32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32449,32453,32457,32461,32465,32469,32473,32477,32481,32485,32489,32493,32497,32501,32505,32509,32513,32517,32521,32525,32529,32533,32537,32541,32545,32549,32553,32557,32561,32565,32569,32573,32577,32581,32585,32589,32593,32597,32601,32605,32609,32613,32617,32621,32625,32629,32633,32637,32641,32645,32649,32653,32657,32661,32665,32669,32673,32677,32681,32685,32689,32693,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696}, |
22908
|
|
|
|
|
|
|
{32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32697}, |
22909
|
|
|
|
|
|
|
{32697,32701,32705,32709,32712,32713,32717,32721,32725,32729,32733,32737,32741,32745,32749,32753,32757,32761,32765,32769,32773,32777,32781,32785,32789,32793,32797,32801,32805,32809,32813,32817,32820,32821,32825,32828,32829,32832,32832,32833,32836,32837,32841,32845,32849,32853,32857,32861,32865,32869,32873,32876,32877,32881,32885,32889,32892,32893,32896,32897,32900,32900,32900,32900,32900,32900,32901,32904,32904,32904,32904,32905,32908,32909,32912,32913,32916,32917,32921,32925,32928,32929,32933,32936,32937,32940,32940,32941,32944,32945,32948,32949,32952,32953,32956,32957,32960,32961,32965,32968,32969,32972,32972,32973,32977,32981,32985,32988,32989,32993,32997,33001,33005,33009,33013,33016,33017,33021,33025,33029,33032,33033,33037,33041,33045,33048,33049,33052,33053,33057,33061,33065,33069,33073,33077,33081,33085,33089,33092,33093,33097,33101,33105,33109,33113,33117,33121,33125,33129,33133,33137,33141,33145,33149,33153,33157,33160,33160,33160,33160,33160,33161,33165,33169,33172,33173,33177,33181,33185,33189,33192,33193,33197,33201,33205,33209,33213,33217,33221,33225,33229,33233,33237,33241,33245,33249,33253,33257,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260}, |
22910
|
|
|
|
|
|
|
{33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33261}, |
22911
|
|
|
|
|
|
|
{33261,33269,33277,33285,33293,33301,33309,33317,33325,33333,33341,33348,33348,33348,33348,33348,33349,33361,33373,33385,33397,33409,33421,33433,33445,33457,33469,33481,33493,33505,33517,33529,33541,33553,33565,33577,33589,33601,33613,33625,33637,33649,33661,33673,33677,33681,33689,33696,33697,33701,33705,33709,33713,33717,33721,33725,33729,33733,33737,33741,33745,33749,33753,33757,33761,33765,33769,33773,33777,33781,33785,33789,33793,33797,33801,33809,33817,33825,33833,33845,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33853,33861,33869,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33877,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33885}, |
22912
|
|
|
|
|
|
|
{33885,33893,33901,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33905,33909,33913,33917,33925,33929,33933,33937,33941,33945,33949,33953,33957,33961,33965,33969,33973,33977,33981,33985,33989,33993,33997,34001,34005,34009,34013,34017,34021,34025,34029,34033,34037,34041,34045,34049,34053,34057,34061,34065,34069,34073,34077,34081,34084,34084,34084,34084,34085,34097,34109,34121,34133,34145,34157,34169,34181,34192,34192,34192,34192,34192,34192,34192,34193,34197,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200}, |
22913
|
|
|
|
|
|
|
{34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34201,34205,34209,34213,34217,34221,34225,34229,34233,34237,34240,34240,34240,34240,34240,34240,34240}, |
22914
|
|
|
|
|
|
|
{34240,34244,34248,34252,34256,34260,34264,34268,34272,34276,34280,34284,34288,34292,34296,34300,34304,34308,34312,34316,34320,34324,34328,34332,34336,34340,34344,34348,34352,34356,34360,34364,34368,34372,34376,34380,34384,34388,34392,34396,34400,34404,34408,34412,34416,34420,34424,34428,34432,34436,34440,34444,34448,34452,34456,34460,34464,34468,34472,34476,34480,34484,34488,34492,34496,34500,34504,34508,34512,34516,34520,34524,34528,34532,34536,34540,34544,34548,34552,34556,34560,34564,34568,34572,34576,34580,34584,34588,34592,34596,34600,34604,34608,34612,34616,34620,34624,34628,34632,34636,34640,34644,34648,34652,34656,34660,34664,34668,34672,34676,34680,34684,34688,34692,34696,34700,34704,34708,34712,34716,34720,34724,34728,34732,34736,34740,34744,34748,34752,34756,34760,34764,34768,34772,34776,34780,34784,34788,34792,34796,34800,34804,34808,34812,34816,34820,34824,34828,34832,34836,34840,34844,34848,34852,34856,34860,34864,34868,34872,34876,34880,34884,34888,34892,34896,34900,34904,34908,34912,34916,34920,34924,34928,34932,34936,34940,34944,34948,34952,34956,34960,34964,34968,34972,34976,34980,34984,34988,34992,34996,35000,35004,35008,35012,35016,35020,35024,35028,35032,35036,35040,35044,35048,35052,35056,35060,35064,35068,35072,35076,35080,35084,35088,35092,35096,35100,35104,35108,35112,35116,35120,35124,35128,35132,35136,35140,35144,35148,35152,35156,35160,35164,35168,35172,35176,35180,35184,35188,35192,35196,35200,35204,35208,35212,35216,35220,35224,35228,35232,35236,35240,35244,35248,35252,35256,35260,35264}, |
22915
|
|
|
|
|
|
|
{35264,35268,35272,35276,35280,35284,35288,35292,35296,35300,35304,35308,35312,35316,35320,35324,35328,35332,35336,35340,35344,35348,35352,35356,35360,35364,35368,35372,35376,35380,35384,35388,35392,35396,35400,35404,35408,35412,35416,35420,35424,35428,35432,35436,35440,35444,35448,35452,35456,35460,35464,35468,35472,35476,35480,35484,35488,35492,35496,35500,35504,35508,35512,35516,35520,35524,35528,35532,35536,35540,35544,35548,35552,35556,35560,35564,35568,35572,35576,35580,35584,35588,35592,35596,35600,35604,35608,35612,35616,35620,35624,35628,35632,35636,35640,35644,35648,35652,35656,35660,35664,35668,35672,35676,35680,35684,35688,35692,35696,35700,35704,35708,35712,35716,35720,35724,35728,35732,35736,35740,35744,35748,35752,35756,35760,35764,35768,35772,35776,35780,35784,35788,35792,35796,35800,35804,35808,35812,35816,35820,35824,35828,35832,35836,35840,35844,35848,35852,35856,35860,35864,35868,35872,35876,35880,35884,35888,35892,35896,35900,35904,35908,35912,35916,35920,35924,35928,35932,35936,35940,35944,35948,35952,35956,35960,35964,35968,35972,35976,35980,35984,35988,35992,35996,36000,36004,36008,36012,36016,36020,36024,36028,36032,36036,36040,36044,36048,36052,36056,36060,36064,36068,36072,36076,36080,36084,36088,36092,36096,36100,36104,36108,36112,36116,36120,36124,36128,36132,36136,36140,36144,36148,36152,36156,36160,36164,36168,36172,36176,36180,36184,36188,36192,36196,36200,36204,36208,36212,36216,36220,36224,36228,36232,36236,36240,36244,36248,36252,36256,36260,36264,36268,36272,36276,36280,36284,36288}, |
22916
|
|
|
|
|
|
|
{36288,36292,36296,36300,36304,36308,36312,36316,36320,36324,36328,36332,36336,36340,36344,36348,36352,36356,36360,36364,36368,36372,36376,36380,36384,36388,36392,36396,36400,36404,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408} |
22917
|
|
|
|
|
|
|
}; |
22918
|
|
|
|
|
|
|
const char32_t uninorms::decomposition_data[] = { |
22919
|
|
|
|
|
|
|
0,32,32,776,97,32,772,50,51,32,769,956,32,807,49,111,49,8260,52,49,8260,50,51,8260,52,65,768,65,769,65,770,65,771,65,776,65,778,67,807,69,768,69,769,69,770,69,776,73,768,73,769,73,770,73,776,78,771,79,768,79,769,79,770,79,771,79,776,85,768,85,769,85,770,85,776,89,769,97,768,97,769,97,770,97,771,97,776,97,778,99,807,101,768,101,769,101,770,101,776,105,768,105,769,105,770,105,776,110,771,111,768,111,769,111,770,111,771,111,776,117,768,117,769,117,770,117,776,121,769,121,776,65,772,97,772,65,774,97,774,65,808,97,808,67,769,99,769,67,770,99,770,67,775,99,775,67,780,99,780,68,780,100,780,69,772,101,772,69,774,101,774,69,775,101,775,69,808,101,808,69,780,101,780,71,770,103,770,71,774,103,774,71,775,103,775,71,807,103,807,72,770,104,770,73,771,105,771,73,772,105,772,73,774,105,774,73,808,105,808,73,775,73,74,105,106,74,770,106,770,75,807,107,807,76,769,108,769,76,807,108,807,76,780,108,780,76,183,108,183,78,769,110,769,78,807,110,807,78,780,110,780,700,110,79,772,111,772,79,774,111,774,79,779,111,779,82,769,114,769,82,807,114,807,82,780,114,780,83,769,115,769,83,770,115,770,83,807,115,807,83,780,115,780,84,807,116,807,84,780,116,780,85,771,117,771,85,772,117,772,85,774,117,774,85,778,117,778,85,779,117,779,85,808,117,808,87,770,119,770,89,770,121,770,89,776,90,769,122,769,90,775,122,775,90,780,122,780,115,79,795,111,795,85,795,117,795,68,90,780,68,122,780,100,122,780,76,74,76,106,108,106,78,74,78,106,110,106,65,780,97,780,73,780,105,780,79,780,111,780,85,780,117,780,85,776,772,117,776,772,85,776,769,117,776,769,85,776,780,117,776,780,85,776,768,117,776,768,65,776,772,97,776,772,65,775,772,97,775,772,198,772,230,772,71,780,103,780,75,780,107,780,79,808,111,808,79,808,772,111,808,772,439,780,658,780,106,780,68,90,68,122,100,122,71,769,103,769,78,768,110,768,65,778,769,97,778,769,198,769,230,769,216,769,248,769,65,783,97,783,65,785,97,785,69,783,101,783,69,785,101,785,73,783,105,783,73,785,105,785,79,783,111,783,79,785,111,785,82,783,114,783,82,785,114,785,85,783,117,783, |
22920
|
|
|
|
|
|
|
85,785,117,785,83,806,115,806,84,806,116,806,72,780,104,780,65,775,97,775,69,807,101,807,79,776,772,111,776,772,79,771,772,111,771,772,79,775,111,775,79,775,772,111,775,772,89,772,121,772,104,614,106,114,633,635,641,119,121,32,774,32,775,32,778,32,808,32,771,32,779,611,108,115,120,661,768,769,787,776,769,697,32,837,59,32,769,168,769,913,769,183,917,769,919,769,921,769,927,769,933,769,937,769,953,776,769,921,776,933,776,945,769,949,769,951,769,953,769,965,776,769,953,776,965,776,959,769,965,769,969,769,946,952,933,978,769,978,776,966,960,954,961,962,920,949,931,1045,768,1045,776,1043,769,1030,776,1050,769,1048,768,1059,774,1048,774,1080,774,1077,768,1077,776,1075,769,1110,776,1082,769,1080,768,1091,774,1140,783,1141,783,1046,774,1078,774,1040,774,1072,774,1040,776,1072,776,1045,774,1077,774,1240,776,1241,776,1046,776,1078,776,1047,776,1079,776,1048,772,1080,772,1048,776,1080,776,1054,776,1086,776,1256,776,1257,776,1069,776,1101,776,1059,772,1091,772,1059,776,1091,776,1059,779,1091,779,1063,776,1095,776,1067,776,1099,776,1381,1410,1575,1619,1575,1620,1608,1620,1575,1621,1610,1620,1575,1652,1608,1652,1735,1652,1610,1652,1749,1620,1729,1620,1746,1620,2344,2364,2352,2364,2355,2364,2325,2364,2326,2364,2327,2364,2332,2364,2337,2364,2338,2364,2347,2364,2351,2364,2503,2494,2503,2519,2465,2492,2466,2492,2479,2492,2610,2620,2616,2620,2582,2620,2583,2620,2588,2620,2603,2620,2887,2902,2887,2878,2887,2903,2849,2876,2850,2876,2962,3031,3014,3006,3015,3006,3014,3031,3142,3158,3263,3285,3270,3285,3270,3286,3270,3266,3270,3266,3285,3398,3390,3399,3390,3398,3415,3545,3530,3545,3535,3545,3535,3530,3545,3551,3661,3634,3789,3762,3755,3737,3755,3745,3851,3906,4023,3916,4023,3921,4023,3926,4023,3931,4023,3904,4021,3953,3954,3953,3956,4018,3968,4018,3953,3968,4019,3968,4019,3953,3968,3953,3968,3986,4023,3996,4023,4001,4023,4006,4023,4011,4023,3984,4021,4133,4142,4316,6917,6965,6919,6965,6921,6965,6923,6965,6925,6965,6929,6965,6970,6965,6972,6965,6974,6965,6975,6965,6978,6965,65,198,66,68, |
22921
|
|
|
|
|
|
|
69,398,71,72,73,74,75,76,77,78,79,546,80,82,84,85,87,97,592,593,7426,98,100,101,601,603,604,103,107,109,331,111,596,7446,7447,112,116,117,7453,623,118,7461,946,947,948,966,967,105,114,117,118,946,947,961,966,967,1085,594,99,597,240,604,102,607,609,613,616,617,618,7547,669,621,7557,671,625,624,626,627,628,629,632,642,643,427,649,650,7452,651,652,122,656,657,658,952,65,805,97,805,66,775,98,775,66,803,98,803,66,817,98,817,67,807,769,99,807,769,68,775,100,775,68,803,100,803,68,817,100,817,68,807,100,807,68,813,100,813,69,772,768,101,772,768,69,772,769,101,772,769,69,813,101,813,69,816,101,816,69,807,774,101,807,774,70,775,102,775,71,772,103,772,72,775,104,775,72,803,104,803,72,776,104,776,72,807,104,807,72,814,104,814,73,816,105,816,73,776,769,105,776,769,75,769,107,769,75,803,107,803,75,817,107,817,76,803,108,803,76,803,772,108,803,772,76,817,108,817,76,813,108,813,77,769,109,769,77,775,109,775,77,803,109,803,78,775,110,775,78,803,110,803,78,817,110,817,78,813,110,813,79,771,769,111,771,769,79,771,776,111,771,776,79,772,768,111,772,768,79,772,769,111,772,769,80,769,112,769,80,775,112,775,82,775,114,775,82,803,114,803,82,803,772,114,803,772,82,817,114,817,83,775,115,775,83,803,115,803,83,769,775,115,769,775,83,780,775,115,780,775,83,803,775,115,803,775,84,775,116,775,84,803,116,803,84,817,116,817,84,813,116,813,85,804,117,804,85,816,117,816,85,813,117,813,85,771,769,117,771,769,85,772,776,117,772,776,86,771,118,771,86,803,118,803,87,768,119,768,87,769,119,769,87,776,119,776,87,775,119,775,87,803,119,803,88,775,120,775,88,776,120,776,89,775,121,775,90,770,122,770,90,803,122,803,90,817,122,817,104,817,116,776,119,778,121,778,97,702,383,775,65,803,97,803,65,777,97,777,65,770,769,97,770,769,65,770,768,97,770,768,65,770,777,97,770,777,65,770,771,97,770,771,65,803,770,97,803,770,65,774,769,97,774,769,65,774,768,97,774,768,65,774,777,97,774,777,65,774,771,97,774,771,65,803,774,97,803,774,69,803,101,803,69,777,101,777,69,771,101,771,69,770,769,101,770,769,69,770,768,101,770, |
22922
|
|
|
|
|
|
|
768,69,770,777,101,770,777,69,770,771,101,770,771,69,803,770,101,803,770,73,777,105,777,73,803,105,803,79,803,111,803,79,777,111,777,79,770,769,111,770,769,79,770,768,111,770,768,79,770,777,111,770,777,79,770,771,111,770,771,79,803,770,111,803,770,79,795,769,111,795,769,79,795,768,111,795,768,79,795,777,111,795,777,79,795,771,111,795,771,79,795,803,111,795,803,85,803,117,803,85,777,117,777,85,795,769,117,795,769,85,795,768,117,795,768,85,795,777,117,795,777,85,795,771,117,795,771,85,795,803,117,795,803,89,768,121,768,89,803,121,803,89,777,121,777,89,771,121,771,945,787,945,788,945,787,768,945,788,768,945,787,769,945,788,769,945,787,834,945,788,834,913,787,913,788,913,787,768,913,788,768,913,787,769,913,788,769,913,787,834,913,788,834,949,787,949,788,949,787,768,949,788,768,949,787,769,949,788,769,917,787,917,788,917,787,768,917,788,768,917,787,769,917,788,769,951,787,951,788,951,787,768,951,788,768,951,787,769,951,788,769,951,787,834,951,788,834,919,787,919,788,919,787,768,919,788,768,919,787,769,919,788,769,919,787,834,919,788,834,953,787,953,788,953,787,768,953,788,768,953,787,769,953,788,769,953,787,834,953,788,834,921,787,921,788,921,787,768,921,788,768,921,787,769,921,788,769,921,787,834,921,788,834,959,787,959,788,959,787,768,959,788,768,959,787,769,959,788,769,927,787,927,788,927,787,768,927,788,768,927,787,769,927,788,769,965,787,965,788,965,787,768,965,788,768,965,787,769,965,788,769,965,787,834,965,788,834,933,788,933,788,768,933,788,769,933,788,834,969,787,969,788,969,787,768,969,788,768,969,787,769,969,788,769,969,787,834,969,788,834,937,787,937,788,937,787,768,937,788,768,937,787,769,937,788,769,937,787,834,937,788,834,945,768,945,769,949,768,949,769,951,768,951,769,953,768,953,769,959,768,959,769,965,768,965,769,969,768,969,769,945,787,837,945,788,837,945,787,768,837,945,788,768,837,945,787,769,837,945,788,769,837,945,787,834,837,945,788,834,837,913,787,837,913,788,837,913,787,768,837,913,788,768,837,913,787,769,837,913,788,769,837,913,787,834,837,913,
 |
22923
|
|
|
|
|
|
|
788,834,837,951,787,837,951,788,837,951,787,768,837,951,788,768,837,951,787,769,837,951,788,769,837,951,787,834,837,951,788,834,837,919,787,837,919,788,837,919,787,768,837,919,788,768,837,919,787,769,837,919,788,769,837,919,787,834,837,919,788,834,837,969,787,837,969,788,837,969,787,768,837,969,788,768,837,969,787,769,837,969,788,769,837,969,787,834,837,969,788,834,837,937,787,837,937,788,837,937,787,768,837,937,788,768,837,937,787,769,837,937,788,769,837,937,787,834,837,937,788,834,837,945,774,945,772,945,768,837,945,837,945,769,837,945,834,945,834,837,913,774,913,772,913,768,913,769,913,837,32,787,953,32,787,32,834,168,834,951,768,837,951,837,951,769,837,951,834,951,834,837,917,768,917,769,919,768,919,769,919,837,8127,768,8127,769,8127,834,953,774,953,772,953,776,768,953,776,769,953,834,953,776,834,921,774,921,772,921,768,921,769,8190,768,8190,769,8190,834,965,774,965,772,965,776,768,965,776,769,961,787,961,788,965,834,965,776,834,933,774,933,772,933,768,933,769,929,788,168,768,168,769,96,969,768,837,969,837,969,769,837,969,834,969,834,837,927,768,927,769,937,768,937,769,937,837,180,32,788,8194,8195,32,32,32,32,32,32,32,32,32,8208,32,819,46,46,46,46,46,46,32,8242,8242,8242,8242,8242,8245,8245,8245,8245,8245,33,33,32,773,63,63,63,33,33,63,8242,8242,8242,8242,32,48,105,52,53,54,55,56,57,43,8722,61,40,41,110,48,49,50,51,52,53,54,55,56,57,43,8722,61,40,41,97,101,111,120,601,104,107,108,109,110,112,115,116,82,115,97,47,99,97,47,115,67,176,67,99,47,111,99,47,117,400,176,70,103,72,72,72,104,295,73,73,76,108,78,78,111,80,81,82,82,82,83,77,84,69,76,84,77,90,937,90,75,65,778,66,67,101,69,70,77,111,1488,1489,1490,1491,105,70,65,88,960,947,915,928,8721,68,100,101,105,106,49,8260,55,49,8260,57,49,8260,49,48,49,8260,51,50,8260,51,49,8260,53,50,8260,53,51,8260,53,52,8260,53,49,8260,54,53,8260,54,49,8260,56,51,8260,56,53,8260,56,55,8260,56,49,8260,73,73,73,73,73,73,73,86,86,86,73,86,73,73,86,73,73,73,73,88,88,88,73,88,73,73,76,67,68,77,105,105,105,105,105,105,105,118,118,118,105,
 |
22924
|
|
|
|
|
|
|
118,105,105,118,105,105,105,105,120,120,120,105,120,105,105,108,99,100,109,48,8260,51,8592,824,8594,824,8596,824,8656,824,8660,824,8658,824,8707,824,8712,824,8715,824,8739,824,8741,824,8747,8747,8747,8747,8747,8750,8750,8750,8750,8750,8764,824,8771,824,8773,824,8776,824,61,824,8801,824,8781,824,60,824,62,824,8804,824,8805,824,8818,824,8819,824,8822,824,8823,824,8826,824,8827,824,8834,824,8835,824,8838,824,8839,824,8866,824,8872,824,8873,824,8875,824,8828,824,8829,824,8849,824,8850,824,8882,824,8883,824,8884,824,8885,824,12296,12297,49,50,51,52,53,54,55,56,57,49,48,49,49,49,50,49,51,49,52,49,53,49,54,49,55,49,56,49,57,50,48,40,49,41,40,50,41,40,51,41,40,52,41,40,53,41,40,54,41,40,55,41,40,56,41,40,57,41,40,49,48,41,40,49,49,41,40,49,50,41,40,49,51,41,40,49,52,41,40,49,53,41,40,49,54,41,40,49,55,41,40,49,56,41,40,49,57,41,40,50,48,41,49,46,50,46,51,46,52,46,53,46,54,46,55,46,56,46,57,46,49,48,46,49,49,46,49,50,46,49,51,46,49,52,46,49,53,46,49,54,46,49,55,46,49,56,46,49,57,46,50,48,46,40,97,41,40,98,41,40,99,41,40,100,41,40,101,41,40,102,41,40,103,41,40,104,41,40,105,41,40,106,41,40,107,41,40,108,41,40,109,41,40,110,41,40,111,41,40,112,41,40,113,41,40,114,41,40,115,41,40,116,41,40,117,41,40,118,41,40,119,41,40,120,41,40,121,41,40,122,41,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,48,8747,8747,8747,8747,58,58,61,61,61,61,61,61,10973,824,106,86,11617,27597,40863,19968,20008,20022,20031,20057,20101,20108,20128,20154,20799,20837,20843,20866,20886,20907,20960,20981,20992,21147,21241,21269,21274,21304,21313,21340,21353,21378,21430,21448,21475,22231,22303,22763,22786,22794,22805,22823,22899,23376,23424,23544,23567,23586,23608,23662,23665,24027,24037,24049,24062,24178,24186,24191,24308,24318,24331,24339,24400,24417,24435,24515,25096,25142,25163,25903,25908,25991,26007,26020,26041,26080,26085,26352,26376,26408,27424,27490,27513,27571,27595, |
22925
|
|
|
|
|
|
|
27604,27611,27663,27668,27700,28779,29226,29238,29243,29247,29255,29273,29275,29356,29572,29577,29916,29926,29976,29983,29992,30000,30091,30098,30326,30333,30382,30399,30446,30683,30690,30707,31034,31160,31166,31348,31435,31481,31859,31992,32566,32593,32650,32701,32769,32780,32786,32819,32895,32905,33251,33258,33267,33276,33292,33307,33311,33390,33394,33400,34381,34411,34880,34892,34915,35198,35211,35282,35328,35895,35910,35925,35960,35997,36196,36208,36275,36523,36554,36763,36784,36789,37009,37193,37318,37324,37329,38263,38272,38428,38582,38585,38632,38737,38750,38754,38761,38859,38893,38899,38913,39080,39131,39135,39318,39321,39340,39592,39640,39647,39717,39727,39730,39740,39770,40165,40565,40575,40613,40635,40643,40653,40657,40697,40701,40718,40723,40736,40763,40778,40786,40845,40860,40864,32,12306,21313,21316,21317,12363,12441,12365,12441,12367,12441,12369,12441,12371,12441,12373,12441,12375,12441,12377,12441,12379,12441,12381,12441,12383,12441,12385,12441,12388,12441,12390,12441,12392,12441,12399,12441,12399,12442,12402,12441,12402,12442,12405,12441,12405,12442,12408,12441,12408,12442,12411,12441,12411,12442,12358,12441,32,12441,32,12442,12445,12441,12424,12426,12459,12441,12461,12441,12463,12441,12465,12441,12467,12441,12469,12441,12471,12441,12473,12441,12475,12441,12477,12441,12479,12441,12481,12441,12484,12441,12486,12441,12488,12441,12495,12441,12495,12442,12498,12441,12498,12442,12501,12441,12501,12442,12504,12441,12504,12442,12507,12441,12507,12442,12454,12441,12527,12441,12528,12441,12529,12441,12530,12441,12541,12441,12467,12488,4352,4353,4522,4354,4524,4525,4355,4356,4357,4528,4529,4530,4531,4532,4533,4378,4358,4359,4360,4385,4361,4362,4363,4364,4365,4366,4367,4368,4369,4370,4449,4450,4451,4452,4453,4454,4455,4456,4457,4458,4459,4460,4461,4462,4463,4464,4465,4466,4467,4468,4469,4448,4372,4373,4551,4552,4556,4558,4563,4567,4569,4380,4573,4575,4381,4382,4384,4386,4387,4391,4393,4395,4396,4397,4398,4399,4402,4406,4416,4423,4428,4593,4594,4439,4440,4441, 
|
22926
|
|
|
|
|
|
|
4484,4485,4488,4497,4498,4500,4510,4513,19968,20108,19977,22235,19978,20013,19979,30002,20057,19993,19969,22825,22320,20154,40,4352,41,40,4354,41,40,4355,41,40,4357,41,40,4358,41,40,4359,41,40,4361,41,40,4363,41,40,4364,41,40,4366,41,40,4367,41,40,4368,41,40,4369,41,40,4370,41,40,4352,4449,41,40,4354,4449,41,40,4355,4449,41,40,4357,4449,41,40,4358,4449,41,40,4359,4449,41,40,4361,4449,41,40,4363,4449,41,40,4364,4449,41,40,4366,4449,41,40,4367,4449,41,40,4368,4449,41,40,4369,4449,41,40,4370,4449,41,40,4364,4462,41,40,4363,4457,4364,4453,4523,41,40,4363,4457,4370,4462,41,40,19968,41,40,20108,41,40,19977,41,40,22235,41,40,20116,41,40,20845,41,40,19971,41,40,20843,41,40,20061,41,40,21313,41,40,26376,41,40,28779,41,40,27700,41,40,26408,41,40,37329,41,40,22303,41,40,26085,41,40,26666,41,40,26377,41,40,31038,41,40,21517,41,40,29305,41,40,36001,41,40,31069,41,40,21172,41,40,20195,41,40,21628,41,40,23398,41,40,30435,41,40,20225,41,40,36039,41,40,21332,41,40,31085,41,40,20241,41,40,33258,41,40,33267,41,21839,24188,25991,31631,80,84,69,50,49,50,50,50,51,50,52,50,53,50,54,50,55,50,56,50,57,51,48,51,49,51,50,51,51,51,52,51,53,4352,4354,4355,4357,4358,4359,4361,4363,4364,4366,4367,4368,4369,4370,4352,4449,4354,4449,4355,4449,4357,4449,4358,4449,4359,4449,4361,4449,4363,4449,4364,4449,4366,4449,4367,4449,4368,4449,4369,4449,4370,4449,4366,4449,4535,4352,4457,4364,4462,4363,4468,4363,4462,19968,20108,19977,22235,20116,20845,19971,20843,20061,21313,26376,28779,27700,26408,37329,22303,26085,26666,26377,31038,21517,29305,36001,31069,21172,31192,30007,22899,36969,20778,21360,27880,38917,20241,20889,27491,19978,20013,19979,24038,21491,21307,23447,23398,30435,20225,36039,21332,22812,51,54,51,55,51,56,51,57,52,48,52,49,52,50,52,51,52,52,52,53,52,54,52,55,52,56,52,57,53,48,49,26376,50,26376,51,26376,52,26376,53,26376,54,26376,55,26376,56,26376,57,26376,49,48,26376,49,49,26376,49,50,26376,72,103,101,114,103,101,86,76,84,68,12450,12452,12454,12456,12458,12459,12461,12463,12465,12467,12469, |
22927
|
|
|
|
|
|
|
12471,12473,12475,12477,12479,12481,12484,12486,12488,12490,12491,12492,12493,12494,12495,12498,12501,12504,12507,12510,12511,12512,12513,12514,12516,12518,12520,12521,12522,12523,12524,12525,12527,12528,12529,12530,20196,21644,12450,12495,12442,12540,12488,12450,12523,12501,12449,12450,12531,12504,12442,12450,12450,12540,12523,12452,12491,12531,12463,12441,12452,12531,12481,12454,12457,12531,12456,12473,12463,12540,12488,12441,12456,12540,12459,12540,12458,12531,12473,12458,12540,12512,12459,12452,12522,12459,12521,12483,12488,12459,12525,12522,12540,12459,12441,12525,12531,12459,12441,12531,12510,12461,12441,12459,12441,12461,12441,12491,12540,12461,12517,12522,12540,12461,12441,12523,12479,12441,12540,12461,12525,12461,12525,12463,12441,12521,12512,12461,12525,12513,12540,12488,12523,12461,12525,12527,12483,12488,12463,12441,12521,12512,12463,12441,12521,12512,12488,12531,12463,12523,12475,12441,12452,12525,12463,12525,12540,12493,12465,12540,12473,12467,12523,12490,12467,12540,12507,12442,12469,12452,12463,12523,12469,12531,12481,12540,12512,12471,12522,12531,12463,12441,12475,12531,12481,12475,12531,12488,12479,12441,12540,12473,12486,12441,12471,12488,12441,12523,12488,12531,12490,12494,12494,12483,12488,12495,12452,12484,12495,12442,12540,12475,12531,12488,12495,12442,12540,12484,12495,12441,12540,12524,12523,12498,12442,12450,12473,12488,12523,12498,12442,12463,12523,12498,12442,12467,12498,12441,12523,12501,12449,12521,12483,12488,12441,12501,12451,12540,12488,12501,12441,12483,12471,12455,12523,12501,12521,12531,12504,12463,12479,12540,12523,12504,12442,12477,12504,12442,12491,12498,12504,12523,12484,12504,12442,12531,12473,12504,12442,12540,12471,12441,12504,12441,12540,12479,12507,12442,12452,12531,12488,12507,12441,12523,12488,12507,12531,12507,12442,12531,12488,12441,12507,12540,12523,12507,12540,12531,12510,12452,12463,12525,12510,12452,12523,12510,12483,12495,12510,12523,12463,12510,12531,12471,12519,12531,12511,12463,12525,12531,12511,12522,12511, |
22928
|
|
|
|
|
|
|
12522,12495,12441,12540,12523,12513,12459,12441,12513,12459,12441,12488,12531,12513,12540,12488,12523,12516,12540,12488,12441,12516,12540,12523,12518,12450,12531,12522,12483,12488,12523,12522,12521,12523,12498,12442,12540,12523,12540,12501,12441,12523,12524,12512,12524,12531,12488,12465,12441,12531,12527,12483,12488,48,28857,49,28857,50,28857,51,28857,52,28857,53,28857,54,28857,55,28857,56,28857,57,28857,49,48,28857,49,49,28857,49,50,28857,49,51,28857,49,52,28857,49,53,28857,49,54,28857,49,55,28857,49,56,28857,49,57,28857,50,48,28857,50,49,28857,50,50,28857,50,51,28857,50,52,28857,104,80,97,100,97,65,85,98,97,114,111,86,112,99,100,109,100,109,50,100,109,51,73,85,24179,25104,26157,21644,22823,27491,26126,27835,26666,24335,20250,31038,112,65,110,65,956,65,109,65,107,65,75,66,77,66,71,66,99,97,108,107,99,97,108,112,70,110,70,956,70,956,103,109,103,107,103,72,122,107,72,122,77,72,122,71,72,122,84,72,122,956,108,109,108,100,108,107,108,102,109,110,109,956,109,109,109,99,109,107,109,109,109,50,99,109,50,109,50,107,109,50,109,109,51,99,109,51,109,51,107,109,51,109,8725,115,109,8725,115,50,80,97,107,80,97,77,80,97,71,80,97,114,97,100,114,97,100,8725,115,114,97,100,8725,115,50,112,115,110,115,956,115,109,115,112,86,110,86,956,86,109,86,107,86,77,86,112,87,110,87,956,87,109,87,107,87,77,87,107,937,77,937,97,46,109,46,66,113,99,99,99,100,67,8725,107,103,67,111,46,100,66,71,121,104,97,72,80,105,110,75,75,75,77,107,116,108,109,108,110,108,111,103,108,120,109,98,109,105,108,109,111,108,80,72,112,46,109,46,80,80,77,80,82,115,114,83,118,87,98,86,8725,109,65,8725,109,49,26085,50,26085,51,26085,52,26085,53,26085,54,26085,55,26085,56,26085,57,26085,49,48,26085,49,49,26085,49,50,26085,49,51,26085,49,52,26085,49,53,26085,49,54,26085,49,55,26085,49,56,26085,49,57,26085,50,48,26085,50,49,26085,50,50,26085,50,51,26085,50,52,26085,50,53,26085,50,54,26085,50,55,26085,50,56,26085,50,57,26085,51,48,26085,51,49,26085,103,97,108,1098,1100,42863,67,70,81,294,339,42791,43831,619,43858,653,35912, |
22929
|
|
|
|
|
|
|
26356,36554,36040,28369,20018,21477,40860,40860,22865,37329,21895,22856,25078,30313,32645,34367,34746,35064,37007,27138,27931,28889,29662,33853,37226,39409,20098,21365,27396,29211,34349,40478,23888,28651,34253,35172,25289,33240,34847,24266,26391,28010,29436,37070,20358,20919,21214,25796,27347,29200,30439,32769,34310,34396,36335,38706,39791,40442,30860,31103,32160,33737,37636,40575,35542,22751,24324,31840,32894,29282,30922,36034,38647,22744,23650,27155,28122,28431,32047,32311,38475,21202,32907,20956,20940,31260,32190,33777,38517,35712,25295,27138,35582,20025,23527,24594,29575,30064,21271,30971,20415,24489,19981,27852,25976,32034,21443,22622,30465,33865,35498,27578,36784,27784,25342,33509,25504,30053,20142,20841,20937,26753,31975,33391,35538,37327,21237,21570,22899,24300,26053,28670,31018,38317,39530,40599,40654,21147,26310,27511,36706,24180,24976,25088,25754,28451,29001,29833,31178,32244,32879,36646,34030,36899,37706,21015,21155,21693,28872,35010,35498,24265,24565,25467,27566,31806,29557,20196,22265,23527,23994,24604,29618,29801,32666,32838,37428,38646,38728,38936,20363,31150,37300,38584,24801,20102,20698,23534,23615,26009,27138,29134,30274,34044,36988,40845,26248,38446,21129,26491,26611,27969,28316,29705,30041,30827,32016,39006,20845,25134,38520,20523,23833,28138,36650,24459,24900,26647,29575,38534,21033,21519,23653,26131,26446,26792,27877,29702,30178,32633,35023,35041,37324,38626,21311,28346,21533,29136,29848,34298,38563,40023,40607,26519,28107,33256,31435,31520,31890,29376,28825,35672,20160,33590,21050,20999,24230,25299,31958,23429,27934,26292,36667,34892,38477,35211,24275,20800,21952,22618,26228,20958,29482,30410,31036,31070,31077,31119,38742,31934,32701,34322,35576,36920,37117,39151,39164,39208,40372,37086,38583,20398,20711,20813,21193,21220,21329,21917,22022,22120,22592,22696,23652,23662,24724,24936,24974,25074,25935,26082,26257,26757,28023,28186,28450,29038,29227,29730,30865,31038,31049,31048,31056,31062,31069,31117,31118,31296,31361,31680,32244,32265,32321, |
22930
|
|
|
|
|
|
|
32626,32773,33261,33401,33401,33879,35088,35222,35585,35641,36051,36104,36790,36920,38627,38911,38971,24693,148206,33304,20006,20917,20840,20352,20805,20864,21191,21242,21917,21845,21913,21986,22618,22707,22852,22868,23138,23336,24274,24281,24425,24493,24792,24910,24840,24974,24928,25074,25140,25540,25628,25682,25942,26228,26391,26395,26454,27513,27578,27969,28379,28363,28450,28702,29038,30631,29237,29359,29482,29809,29958,30011,30237,30239,30410,30427,30452,30538,30528,30924,31409,31680,31867,32091,32244,32574,32773,33618,33775,34681,35137,35206,35222,35519,35576,35531,35585,35582,35565,35641,35722,36104,36664,36978,37273,37494,38524,38627,38742,38875,38911,38923,38971,39698,40860,141386,141380,144341,15261,16408,16441,152137,154832,163539,40771,40846,102,102,102,105,102,108,102,102,105,102,102,108,115,116,115,116,1396,1398,1396,1381,1396,1387,1406,1398,1396,1389,1497,1460,1522,1463,1506,1488,1491,1492,1499,1500,1501,1512,1514,43,1513,1473,1513,1474,1513,1468,1473,1513,1468,1474,1488,1463,1488,1464,1488,1468,1489,1468,1490,1468,1491,1468,1492,1468,1493,1468,1494,1468,1496,1468,1497,1468,1498,1468,1499,1468,1500,1468,1502,1468,1504,1468,1505,1468,1507,1468,1508,1468,1510,1468,1511,1468,1512,1468,1513,1468,1514,1468,1493,1465,1489,1471,1499,1471,1508,1471,1488,1500,1649,1649,1659,1659,1659,1659,1662,1662,1662,1662,1664,1664,1664,1664,1658,1658,1658,1658,1663,1663,1663,1663,1657,1657,1657,1657,1700,1700,1700,1700,1702,1702,1702,1702,1668,1668,1668,1668,1667,1667,1667,1667,1670,1670,1670,1670,1671,1671,1671,1671,1677,1677,1676,1676,1678,1678,1672,1672,1688,1688,1681,1681,1705,1705,1705,1705,1711,1711,1711,1711,1715,1715,1715,1715,1713,1713,1713,1713,1722,1722,1723,1723,1723,1723,1749,1620,1749,1620,1729,1729,1729,1729,1726,1726,1726,1726,1746,1746,1746,1620,1746,1620,1709,1709,1709,1709,1735,1735,1734,1734,1736,1736,1735,1652,1739,1739,1733,1733,1737,1737,1744,1744,1744,1744,1609,1609,1610,1620,1575,1610,1620,1575,1610,1620,1749,1610,1620,1749,1610,1620,1608,1610,1620,
 |
22931
|
|
|
|
|
|
|
1608,1610,1620,1735,1610,1620,1735,1610,1620,1734,1610,1620,1734,1610,1620,1736,1610,1620,1736,1610,1620,1744,1610,1620,1744,1610,1620,1744,1610,1620,1609,1610,1620,1609,1610,1620,1609,1740,1740,1740,1740,1610,1620,1580,1610,1620,1581,1610,1620,1605,1610,1620,1609,1610,1620,1610,1576,1580,1576,1581,1576,1582,1576,1605,1576,1609,1576,1610,1578,1580,1578,1581,1578,1582,1578,1605,1578,1609,1578,1610,1579,1580,1579,1605,1579,1609,1579,1610,1580,1581,1580,1605,1581,1580,1581,1605,1582,1580,1582,1581,1582,1605,1587,1580,1587,1581,1587,1582,1587,1605,1589,1581,1589,1605,1590,1580,1590,1581,1590,1582,1590,1605,1591,1581,1591,1605,1592,1605,1593,1580,1593,1605,1594,1580,1594,1605,1601,1580,1601,1581,1601,1582,1601,1605,1601,1609,1601,1610,1602,1581,1602,1605,1602,1609,1602,1610,1603,1575,1603,1580,1603,1581,1603,1582,1603,1604,1603,1605,1603,1609,1603,1610,1604,1580,1604,1581,1604,1582,1604,1605,1604,1609,1604,1610,1605,1580,1605,1581,1605,1582,1605,1605,1605,1609,1605,1610,1606,1580,1606,1581,1606,1582,1606,1605,1606,1609,1606,1610,1607,1580,1607,1605,1607,1609,1607,1610,1610,1580,1610,1581,1610,1582,1610,1605,1610,1609,1610,1610,1584,1648,1585,1648,1609,1648,32,1612,1617,32,1613,1617,32,1614,1617,32,1615,1617,32,1616,1617,32,1617,1648,1610,1620,1585,1610,1620,1586,1610,1620,1605,1610,1620,1606,1610,1620,1609,1610,1620,1610,1576,1585,1576,1586,1576,1605,1576,1606,1576,1609,1576,1610,1578,1585,1578,1586,1578,1605,1578,1606,1578,1609,1578,1610,1579,1585,1579,1586,1579,1605,1579,1606,1579,1609,1579,1610,1601,1609,1601,1610,1602,1609,1602,1610,1603,1575,1603,1604,1603,1605,1603,1609,1603,1610,1604,1605,1604,1609,1604,1610,1605,1575,1605,1605,1606,1585,1606,1586,1606,1605,1606,1606,1606,1609,1606,1610,1609,1648,1610,1585,1610,1586,1610,1605,1610,1606,1610,1609,1610,1610,1610,1620,1580,1610,1620,1581,1610,1620,1582,1610,1620,1605,1610,1620,1607,1576,1580,1576,1581,1576,1582,1576,1605,1576,1607,1578,1580,1578,1581,1578,1582,1578,1605,1578,1607,1579,1605,1580,1581,1580,1605,1581, |
22932
|
|
|
|
|
|
|
1580,1581,1605,1582,1580,1582,1605,1587,1580,1587,1581,1587,1582,1587,1605,1589,1581,1589,1582,1589,1605,1590,1580,1590,1581,1590,1582,1590,1605,1591,1581,1592,1605,1593,1580,1593,1605,1594,1580,1594,1605,1601,1580,1601,1581,1601,1582,1601,1605,1602,1581,1602,1605,1603,1580,1603,1581,1603,1582,1603,1604,1603,1605,1604,1580,1604,1581,1604,1582,1604,1605,1604,1607,1605,1580,1605,1581,1605,1582,1605,1605,1606,1580,1606,1581,1606,1582,1606,1605,1606,1607,1607,1580,1607,1605,1607,1648,1610,1580,1610,1581,1610,1582,1610,1605,1610,1607,1610,1620,1605,1610,1620,1607,1576,1605,1576,1607,1578,1605,1578,1607,1579,1605,1579,1607,1587,1605,1587,1607,1588,1605,1588,1607,1603,1604,1603,1605,1604,1605,1606,1605,1606,1607,1610,1605,1610,1607,1600,1614,1617,1600,1615,1617,1600,1616,1617,1591,1609,1591,1610,1593,1609,1593,1610,1594,1609,1594,1610,1587,1609,1587,1610,1588,1609,1588,1610,1581,1609,1581,1610,1580,1609,1580,1610,1582,1609,1582,1610,1589,1609,1589,1610,1590,1609,1590,1610,1588,1580,1588,1581,1588,1582,1588,1605,1588,1585,1587,1585,1589,1585,1590,1585,1591,1609,1591,1610,1593,1609,1593,1610,1594,1609,1594,1610,1587,1609,1587,1610,1588,1609,1588,1610,1581,1609,1581,1610,1580,1609,1580,1610,1582,1609,1582,1610,1589,1609,1589,1610,1590,1609,1590,1610,1588,1580,1588,1581,1588,1582,1588,1605,1588,1585,1587,1585,1589,1585,1590,1585,1588,1580,1588,1581,1588,1582,1588,1605,1587,1607,1588,1607,1591,1605,1587,1580,1587,1581,1587,1582,1588,1580,1588,1581,1588,1582,1591,1605,1592,1605,1575,1611,1575,1611,1578,1580,1605,1578,1581,1580,1578,1581,1580,1578,1581,1605,1578,1582,1605,1578,1605,1580,1578,1605,1581,1578,1605,1582,1580,1605,1581,1580,1605,1581,1581,1605,1610,1581,1605,1609,1587,1581,1580,1587,1580,1581,1587,1580,1609,1587,1605,1581,1587,1605,1581,1587,1605,1580,1587,1605,1605,1587,1605,1605,1589,1581,1581,1589,1581,1581,1589,1605,1605,1588,1581,1605,1588,1581,1605,1588,1580,1610,1588,1605,1582,1588,1605,1582,1588,1605,1605,1588,1605,1605,1590,1581,1609,1590,1582,1605,1590,1582,
 |
22933
|
|
|
|
|
|
|
1605,1591,1605,1581,1591,1605,1581,1591,1605,1605,1591,1605,1610,1593,1580,1605,1593,1605,1605,1593,1605,1605,1593,1605,1609,1594,1605,1605,1594,1605,1610,1594,1605,1609,1601,1582,1605,1601,1582,1605,1602,1605,1581,1602,1605,1605,1604,1581,1605,1604,1581,1610,1604,1581,1609,1604,1580,1580,1604,1580,1580,1604,1582,1605,1604,1582,1605,1604,1605,1581,1604,1605,1581,1605,1581,1580,1605,1581,1605,1605,1581,1610,1605,1580,1581,1605,1580,1605,1605,1582,1580,1605,1582,1605,1605,1580,1582,1607,1605,1580,1607,1605,1605,1606,1581,1605,1606,1581,1609,1606,1580,1605,1606,1580,1605,1606,1580,1609,1606,1605,1610,1606,1605,1609,1610,1605,1605,1610,1605,1605,1576,1582,1610,1578,1580,1610,1578,1580,1609,1578,1582,1610,1578,1582,1609,1578,1605,1610,1578,1605,1609,1580,1605,1610,1580,1581,1609,1580,1605,1609,1587,1582,1609,1589,1581,1610,1588,1581,1610,1590,1581,1610,1604,1580,1610,1604,1605,1610,1610,1581,1610,1610,1580,1610,1610,1605,1610,1605,1605,1610,1602,1605,1610,1606,1581,1610,1602,1605,1581,1604,1581,1605,1593,1605,1610,1603,1605,1610,1606,1580,1581,1605,1582,1610,1604,1580,1605,1603,1605,1605,1604,1580,1605,1606,1580,1581,1580,1581,1610,1581,1580,1610,1605,1580,1610,1601,1605,1610,1576,1581,1610,1603,1605,1605,1593,1580,1605,1589,1605,1605,1587,1582,1610,1606,1580,1610,1589,1604,1746,1602,1604,1746,1575,1604,1604,1607,1575,1603,1576,1585,1605,1581,1605,1583,1589,1604,1593,1605,1585,1587,1608,1604,1593,1604,1610,1607,1608,1587,1604,1605,1589,1604,1609,1589,1604,1609,32,1575,1604,1604,1607,32,1593,1604,1610,1607,32,1608,1587,1604,1605,1580,1604,32,1580,1604,1575,1604,1607,1585,1740,1575,1604,44,12289,12290,58,59,33,63,12310,12311,46,46,46,46,46,8212,8211,95,95,40,41,123,125,12308,12309,12304,12305,12298,12299,12296,12297,12300,12301,12302,12303,91,93,32,773,32,773,32,773,32,773,95,95,95,44,12289,46,59,58,63,33,8212,40,41,123,125,12308,12309,35,38,42,43,45,60,62,61,92,36,37,64,32,1611,1600,1611,32,1612,32,1613,32,1614,1600,1614,32,1615,1600,1615,32,1616,1600,1616,32,1617,1600, |
22934
|
|
|
|
|
|
|
1617,32,1618,1600,1618,1569,1575,1619,1575,1619,1575,1620,1575,1620,1608,1620,1608,1620,1575,1621,1575,1621,1610,1620,1610,1620,1610,1620,1610,1620,1575,1575,1576,1576,1576,1576,1577,1577,1578,1578,1578,1578,1579,1579,1579,1579,1580,1580,1580,1580,1581,1581,1581,1581,1582,1582,1582,1582,1583,1583,1584,1584,1585,1585,1586,1586,1587,1587,1587,1587,1588,1588,1588,1588,1589,1589,1589,1589,1590,1590,1590,1590,1591,1591,1591,1591,1592,1592,1592,1592,1593,1593,1593,1593,1594,1594,1594,1594,1601,1601,1601,1601,1602,1602,1602,1602,1603,1603,1603,1603,1604,1604,1604,1604,1605,1605,1605,1605,1606,1606,1606,1606,1607,1607,1607,1607,1608,1608,1609,1609,1610,1610,1610,1610,1604,1575,1619,1604,1575,1619,1604,1575,1620,1604,1575,1620,1604,1575,1621,1604,1575,1621,1604,1575,1604,1575,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,10629,10630,12290,12300,12301,12289,12539,12530,12449,12451,12453,12455,12457,12515,12517,12519,12483,12540,12450,12452,12454,12456,12458,12459,12461,12463,12465,12467,12469,12471,12473,12475,12477,12479,12481,12484,12486,12488,12490,12491,12492,12493,12494,12495,12498,12501,12504,12507,12510,12511,12512,12513,12514,12516,12518,12520,12521,12522,12523,12524,12525,12527,12531,12441,12442,4448,4352,4353,4522,4354,4524,4525,4355,4356,4357,4528,4529,4530,4531,4532,4533,4378,4358,4359,4360,4385,4361,4362,4363,4364,4365,4366,4367,4368,4369,4370,4449,4450,4451,4452,4453,4454,4455,4456,4457,4458,4459,4460,4461,4462,4463,4464,4465,4466,4467,4468,4469,162,163,172,32,772,166,165,8361,9474,8592,8593,8594,8595,9632,9675,720,721,230,665,595,675,43878,677,676,598,599,7569,600,606,681,612,610,608,667,295,668,615,644,682,683,620,122628,42894,622,122629,654,122630,248,630,631,113,634,122632,637,638,640,680,678,43879, 
|
22935
|
|
|
|
|
|
|
679,648,11377,655,673,674,664,448,449,450,122634,122654,69785,69818,69787,69818,69797,69818,69937,69927,69938,69927,70471,70462,70471,70487,70841,70842,70841,70832,70841,70845,71096,71087,71097,71087,71989,71984,119127,119141,119128,119141,119128,119141,119150,119128,119141,119151,119128,119141,119152,119128,119141,119153,119128,119141,119154,119225,119141,119226,119141,119225,119141,119150,119226,119141,119150,119225,119141,119151,119226,119141,119151,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,67,68,71,74,75,78,79,80,81,83,84,85,86,87,88,89,90,97,98,99,100,102,104,105,106,107,108,109,110,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,68,69,70,71,74,75,76,77,78,79,80,81,83,84,85,86,87,88,89,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,68,69,70,71,73,74,75,76,77,79,83,84,85,86,87,88,89,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66, |
22936
|
|
|
|
|
|
|
67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,305,567,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,988,989,48,49,50,51,52,53,54,55,56,57,48,49,50,51,52,53,54,55,56,57,48,49,50,51,52,53,54,55,56,57,48,49,50,51, |
22937
|
|
|
|
|
|
|
52,53,54,55,56,57,48,49,50,51,52,53,54,55,56,57,1072,1073,1074,1075,1076,1077,1078,1079,1080,1082,1083,1084,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1099,1101,1102,42633,1241,1110,1112,1257,1199,1231,1072,1073,1074,1075,1076,1077,1078,1079,1080,1082,1083,1086,1087,1089,1091,1092,1093,1094,1095,1096,1098,1099,1169,1110,1109,1119,1195,42577,1201,1575,1576,1580,1583,1608,1586,1581,1591,1610,1603,1604,1605,1606,1587,1593,1601,1589,1602,1585,1588,1578,1579,1582,1584,1590,1592,1594,1646,1722,1697,1647,1576,1580,1607,1581,1610,1603,1604,1605,1606,1587,1593,1601,1589,1602,1588,1578,1579,1582,1590,1594,1580,1581,1610,1604,1606,1587,1593,1589,1602,1588,1582,1590,1594,1722,1647,1576,1580,1607,1581,1591,1610,1603,1605,1606,1587,1593,1601,1589,1602,1588,1578,1579,1582,1590,1592,1594,1646,1697,1575,1576,1580,1583,1607,1608,1586,1581,1591,1610,1604,1605,1606,1587,1593,1601,1589,1602,1585,1588,1578,1579,1582,1584,1590,1592,1594,1576,1580,1583,1608,1586,1581,1591,1610,1604,1605,1606,1587,1593,1601,1589,1602,1585,1588,1578,1579,1582,1584,1590,1592,1594,48,46,48,44,49,44,50,44,51,44,52,44,53,44,54,44,55,44,56,44,57,44,40,65,41,40,66,41,40,67,41,40,68,41,40,69,41,40,70,41,40,71,41,40,72,41,40,73,41,40,74,41,40,75,41,40,76,41,40,77,41,40,78,41,40,79,41,40,80,41,40,81,41,40,82,41,40,83,41,40,84,41,40,85,41,40,86,41,40,87,41,40,88,41,40,89,41,40,90,41,12308,83,12309,67,82,67,68,87,90,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,72,86,77,86,83,68,83,83,80,80,86,87,67,77,67,77,68,77,82,68,74,12411,12363,12467,12467,12469,25163,23383,21452,12486,12441,20108,22810,35299,22825,20132,26144,28961,26009,21069,24460,20877,26032,21021,32066,29983,36009,22768,21561,28436,25237,25429,19968,19977,36938,24038,20013,21491,25351,36208,25171,31105,31354,21512,28288,26377,26376,30003,21106,21942,37197,12308,26412,12309,12308,19977,12309,12308,20108,12309,12308,23433,12309,12308,28857,12309,12308,25171,12309,12308,30423,12309,12308,21213,12309,12308,25943, |
22938
|
|
|
|
|
|
|
12309,24471,21487,48,49,50,51,52,53,54,55,56,57,20029,20024,20033,131362,20320,20398,20411,20482,20602,20633,20711,20687,13470,132666,20813,20820,20836,20855,132380,13497,20839,20877,132427,20887,20900,20172,20908,20917,168415,20981,20995,13535,21051,21062,21106,21111,13589,21191,21193,21220,21242,21253,21254,21271,21321,21329,21338,21363,21373,21375,21375,21375,133676,28784,21450,21471,133987,21483,21489,21510,21662,21560,21576,21608,21666,21750,21776,21843,21859,21892,21892,21913,21931,21939,21954,22294,22022,22295,22097,22132,20999,22766,22478,22516,22541,22411,22578,22577,22700,136420,22770,22775,22790,22810,22818,22882,136872,136938,23020,23067,23079,23000,23142,14062,14076,23304,23358,23358,137672,23491,23512,23527,23539,138008,23551,23558,24403,23586,14209,23648,23662,23744,23693,138724,23875,138726,23918,23915,23932,24033,24034,14383,24061,24104,24125,24169,14434,139651,14460,24240,24243,24246,24266,172946,24318,140081,140081,33281,24354,24354,14535,144056,156122,24418,24427,14563,24474,24525,24535,24569,24705,14650,14620,24724,141012,24775,24904,24908,24910,24908,24954,24974,25010,24996,25007,25054,25074,25078,25104,25115,25181,25265,25300,25424,142092,25405,25340,25448,25475,25572,142321,25634,25541,25513,14894,25705,25726,25757,25719,14956,25935,25964,143370,26083,26360,26185,15129,26257,15112,15076,20882,20885,26368,26268,32941,17369,26391,26395,26401,26462,26451,144323,15177,26618,26501,26706,26757,144493,26766,26655,26900,15261,26946,27043,27114,27304,145059,27355,15384,27425,145575,27476,15438,27506,27551,27578,27579,146061,138507,146170,27726,146620,27839,27853,27751,27926,27966,28023,27969,28009,28024,28037,146718,27956,28207,28270,15667,28363,28359,147153,28153,28526,147294,147342,28614,28729,28702,28699,15766,28746,28797,28791,28845,132389,28997,148067,29084,148395,29224,29237,29264,149000,29312,29333,149301,149524,29562,29579,16044,29605,16056,16056,29767,29788,29809,29829,29898,16155,29988,150582,30014,150674,30064,139679,30224,151457,151480, |
22939
|
|
|
|
|
|
|
151620,16380,16392,30452,151795,151794,151833,151859,30494,30495,30495,30538,16441,30603,16454,16534,152605,30798,30860,30924,16611,153126,31062,153242,153285,31119,31211,16687,31296,31306,31311,153980,154279,154279,31470,16898,154539,31686,31689,16935,154752,31954,17056,31976,31971,32000,155526,32099,17153,32199,32258,32325,17204,156200,156231,17241,156377,32634,156478,32661,32762,32773,156890,156963,32864,157096,32880,144223,17365,32946,33027,17419,33086,23221,157607,157621,144275,144284,33281,33284,36766,17515,33425,33419,33437,21171,33457,33459,33469,33510,158524,33509,33565,33635,33709,33571,33725,33767,33879,33619,33738,33740,33756,158774,159083,158933,17707,34033,34035,34070,160714,34148,159532,17757,17761,159665,159954,17771,34384,34396,34407,34409,34473,34440,34574,34530,34681,34600,34667,34694,17879,34785,34817,17913,34912,34915,161383,35031,35038,17973,35066,13499,161966,162150,18110,18119,35488,35565,35722,35925,162984,36011,36033,36123,36215,163631,133124,36299,36284,36336,133342,36564,36664,165330,165357,37012,37105,37137,165678,37147,37432,37591,37592,37500,37881,37909,166906,38283,18837,38327,167287,18918,38595,23986,38691,168261,168474,19054,19062,38880,168970,19122,169110,38923,38923,38953,169398,39138,19251,39209,39335,39362,39422,19406,170800,39698,40000,40189,19662,19693,40295,172238,19704,172293,172558,172689,40635,19798,40697,40702,40709,40719,40726,40763,173568 |
22940
|
|
|
|
|
|
|
}; |
22941
|
|
|
|
|
|
|
|
22942
|
|
|
|
|
|
|
} // namespace unilib |
22943
|
|
|
|
|
|
|
|
22944
|
|
|
|
|
|
|
///////// |
22945
|
|
|
|
|
|
|
// File: unilib/utf8.cpp |
22946
|
|
|
|
|
|
|
///////// |
22947
|
|
|
|
|
|
|
|
22948
|
|
|
|
|
|
|
// This file is part of UniLib . |
22949
|
|
|
|
|
|
|
// |
22950
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
22951
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
22952
|
|
|
|
|
|
|
// |
22953
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
22954
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
22955
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
22956
|
|
|
|
|
|
|
// |
22957
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
22958
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
22959
|
|
|
|
|
|
|
|
22960
|
|
|
|
|
|
|
namespace unilib { |
22961
|
|
|
|
|
|
|
|
22962
|
0
|
|
|
|
|
|
// Checks whether the NUL-terminated string has the byte structure of UTF-8.
//
// Only the shape of each sequence is inspected: a lead byte must be followed
// by the right number of continuation bytes (0x80..0xBF). Overlong forms,
// surrogates and values encoded with lead bytes up to 0xF7 are not rejected.
// A NUL inside a multi-byte sequence fails the continuation test, so the scan
// never runs past the terminator.
//
// Returns true when every sequence is structurally complete, false otherwise.
bool utf8::valid(const char* str) {
  while (*str) {
    const unsigned char lead = static_cast<unsigned char>(*str);

    if (lead >= 0x80) {
      int continuation_bytes;
      if (lead < 0xC0) return false;                 // continuation byte in lead position
      else if (lead < 0xE0) continuation_bytes = 1;  // 110xxxxx
      else if (lead < 0xF0) continuation_bytes = 2;  // 1110xxxx
      else if (lead < 0xF8) continuation_bytes = 3;  // 11110xxx
      else return false;                             // 0xF8..0xFF never start a sequence

      while (continuation_bytes-- > 0) {
        const unsigned char next = static_cast<unsigned char>(*++str);
        if (next < 0x80 || next >= 0xC0) return false;
      }
    }

    ++str;
  }

  return true;
}
22979
|
|
|
|
|
|
|
|
22980
|
0
|
|
|
|
|
|
// Checks whether the first len bytes of str have the byte structure of UTF-8.
//
// Same structural test as the NUL-terminated overload, but bounded by len:
// a multi-byte sequence that would extend past the end of the buffer makes
// the function return false before the missing byte is read. Zero bytes are
// treated as ordinary single-byte characters.
//
// Returns true when every sequence is structurally complete, false otherwise.
bool utf8::valid(const char* str, size_t len) {
  while (len > 0) {
    const unsigned char lead = static_cast<unsigned char>(*str);

    if (lead >= 0x80) {
      int continuation_bytes;
      if (lead < 0xC0) return false;                 // continuation byte in lead position
      else if (lead < 0xE0) continuation_bytes = 1;  // 110xxxxx
      else if (lead < 0xF0) continuation_bytes = 2;  // 1110xxxx
      else if (lead < 0xF8) continuation_bytes = 3;  // 11110xxx
      else return false;                             // 0xF8..0xFF never start a sequence

      while (continuation_bytes-- > 0) {
        ++str;
        // Consume the previous byte from the budget first; if nothing is
        // left, the sequence is truncated and *str must not be read.
        if (--len == 0) return false;

        const unsigned char next = static_cast<unsigned char>(*str);
        if (next < 0x80 || next >= 0xC0) return false;
      }
    }

    ++str;
    --len;
  }

  return true;
}
22997
|
|
|
|
|
|
|
|
22998
|
0
|
|
|
|
|
|
// Decodes a NUL-terminated string into `decoded`, replacing its previous
// contents. Characters are obtained one at a time from the single-character
// decode(str) overload (defined elsewhere; presumably it advances str --
// TODO confirm) until that overload yields 0, which is not stored.
void utf8::decode(const char* str, std::u32string& decoded) {
  decoded.clear();

  char32_t code_point = decode(str);
  while (code_point) {
    decoded.push_back(code_point);
    code_point = decode(str);
  }
}
23004
|
|
|
|
|
|
|
|
23005
|
0
|
|
|
|
|
|
void utf8::decode(const char* str, size_t len, std::u32string& decoded) { |
23006
|
|
|
|
|
|
|
decoded.clear(); |
23007
|
|
|
|
|
|
|
|
23008
|
0
|
0
|
|
|
|
|
while (len) |
23009
|
0
|
|
|
|
|
|
decoded.push_back(decode(str, len)); |
23010
|
0
|
|
|
|
|
|
} |
23011
|
|
|
|
|
|
|
|
23012
|
0
|
|
|
|
|
|
void utf8::encode(const std::u32string& str, std::string& encoded) { |
23013
|
|
|
|
|
|
|
encoded.clear(); |
23014
|
|
|
|
|
|
|
|
23015
|
0
|
0
|
|
|
|
|
for (auto&& chr : str) |
23016
|
0
|
|
|
|
|
|
append(encoded, chr); |
23017
|
0
|
|
|
|
|
|
} |
23018
|
|
|
|
|
|
|
|
23019
|
|
|
|
|
|
|
// Namespace-scope definition of the static member utf8::REPLACEMENT_CHAR.
// No initializer appears here, so the value is presumably supplied at the
// in-class declaration (not visible in this part of the file) -- TODO confirm.
const char utf8::REPLACEMENT_CHAR; |
23020
|
|
|
|
|
|
|
|
23021
|
|
|
|
|
|
|
} // namespace unilib |
23022
|
|
|
|
|
|
|
|
23023
|
|
|
|
|
|
|
///////// |
23024
|
|
|
|
|
|
|
// File: unilib/version.cpp |
23025
|
|
|
|
|
|
|
///////// |
23026
|
|
|
|
|
|
|
|
23027
|
|
|
|
|
|
|
// This file is part of UniLib . |
23028
|
|
|
|
|
|
|
// |
23029
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
23030
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
23031
|
|
|
|
|
|
|
// |
23032
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
23033
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
23034
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
23035
|
|
|
|
|
|
|
// |
23036
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
23037
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
23038
|
|
|
|
|
|
|
|
23039
|
|
|
|
|
|
|
namespace unilib { |
23040
|
|
|
|
|
|
|
|
23041
|
|
|
|
|
|
|
// Returns current version. |
23042
|
0
|
|
|
|
|
|
version version::current() { |
23043
|
0
|
0
|
|
|
|
|
return {3, 3, 0, ""}; |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
23044
|
|
|
|
|
|
|
} |
23045
|
|
|
|
|
|
|
|
23046
|
|
|
|
|
|
|
} // namespace unilib |
23047
|
|
|
|
|
|
|
|
23048
|
|
|
|
|
|
|
///////// |
23049
|
|
|
|
|
|
|
// File: utils/compressor_load.cpp |
23050
|
|
|
|
|
|
|
///////// |
23051
|
|
|
|
|
|
|
|
23052
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
23053
|
|
|
|
|
|
|
// |
23054
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
23055
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
23056
|
|
|
|
|
|
|
// |
23057
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
23058
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
23059
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
23060
|
|
|
|
|
|
|
|
23061
|
|
|
|
|
|
|
namespace utils { |
23062
|
|
|
|
|
|
|
|
23063
|
|
|
|
|
|
|
// Start of LZMA compression library by Igor Pavlov |
23064
|
|
|
|
|
|
|
namespace lzma { |
23065
|
|
|
|
|
|
|
|
23066
|
|
|
|
|
|
|
// Types.h -- Basic types |
23067
|
|
|
|
|
|
|
// 2010-10-09 : Igor Pavlov : Public domain |
23068
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
23069
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
23070
|
|
|
|
|
|
|
|
23071
|
|
|
|
|
|
|
#define SZ_OK 0 |
23072
|
|
|
|
|
|
|
|
23073
|
|
|
|
|
|
|
#define SZ_ERROR_DATA 1 |
23074
|
|
|
|
|
|
|
#define SZ_ERROR_MEM 2 |
23075
|
|
|
|
|
|
|
#define SZ_ERROR_CRC 3 |
23076
|
|
|
|
|
|
|
#define SZ_ERROR_UNSUPPORTED 4 |
23077
|
|
|
|
|
|
|
#define SZ_ERROR_PARAM 5 |
23078
|
|
|
|
|
|
|
#define SZ_ERROR_INPUT_EOF 6 |
23079
|
|
|
|
|
|
|
#define SZ_ERROR_OUTPUT_EOF 7 |
23080
|
|
|
|
|
|
|
#define SZ_ERROR_READ 8 |
23081
|
|
|
|
|
|
|
#define SZ_ERROR_WRITE 9 |
23082
|
|
|
|
|
|
|
#define SZ_ERROR_PROGRESS 10 |
23083
|
|
|
|
|
|
|
#define SZ_ERROR_FAIL 11 |
23084
|
|
|
|
|
|
|
#define SZ_ERROR_THREAD 12 |
23085
|
|
|
|
|
|
|
|
23086
|
|
|
|
|
|
|
#define SZ_ERROR_ARCHIVE 16 |
23087
|
|
|
|
|
|
|
#define SZ_ERROR_NO_ARCHIVE 17 |
23088
|
|
|
|
|
|
|
|
23089
|
|
|
|
|
|
|
typedef int SRes; |
23090
|
|
|
|
|
|
|
|
23091
|
|
|
|
|
|
|
#ifndef RINOK |
23092
|
|
|
|
|
|
|
#define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; } |
23093
|
|
|
|
|
|
|
#endif |
23094
|
|
|
|
|
|
|
|
23095
|
|
|
|
|
|
|
/* The following interfaces use first parameter as pointer to structure */ |
23096
|
|
|
|
|
|
|
|
23097
|
|
|
|
|
|
|
// Byte-at-a-time input interface; the callback receives the object pointer
// as its first argument.
struct IByteIn {
  // Reads one byte; returns 0 in case of EOF or error.
  uint8_t (*Read)(void *p);
};
23101
|
|
|
|
|
|
|
|
23102
|
|
|
|
|
|
|
// Byte-at-a-time output interface; the callback receives the object pointer
// as its first argument and the byte to emit as its second.
struct IByteOut {
  void (*Write)(void *p, uint8_t b);
};
23106
|
|
|
|
|
|
|
|
23107
|
|
|
|
|
|
|
struct ISeqInStream |
23108
|
|
|
|
|
|
|
{ |
23109
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
23110
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
23111
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
23112
|
|
|
|
|
|
|
}; |
23113
|
|
|
|
|
|
|
|
23114
|
|
|
|
|
|
|
/* it can return SZ_ERROR_INPUT_EOF */ |
23115
|
|
|
|
|
|
|
SRes SeqInStream_Read(ISeqInStream *stream, void *buf, size_t size); |
23116
|
|
|
|
|
|
|
SRes SeqInStream_Read2(ISeqInStream *stream, void *buf, size_t size, SRes errorType); |
23117
|
|
|
|
|
|
|
SRes SeqInStream_ReadByte(ISeqInStream *stream, uint8_t *buf); |
23118
|
|
|
|
|
|
|
|
23119
|
|
|
|
|
|
|
// Sequential output stream interface.
struct ISeqOutStream {
  // Returns the number of bytes actually written;
  // (result < size) means error.
  size_t (*Write)(void *p, const void *buf, size_t size);
};
23125
|
|
|
|
|
|
|
|
23126
|
|
|
|
|
|
|
// Seek origins accepted by the Seek callbacks of the stream interfaces.
// The enumerators take the values 0, 1 and 2 in this order.
enum ESzSeek {
  SZ_SEEK_SET = 0,
  SZ_SEEK_CUR,
  SZ_SEEK_END
};
23132
|
|
|
|
|
|
|
|
23133
|
|
|
|
|
|
|
struct ISeekInStream |
23134
|
|
|
|
|
|
|
{ |
23135
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); /* same as ISeqInStream::Read */ |
23136
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
23137
|
|
|
|
|
|
|
}; |
23138
|
|
|
|
|
|
|
|
23139
|
|
|
|
|
|
|
struct ILookInStream |
23140
|
|
|
|
|
|
|
{ |
23141
|
|
|
|
|
|
|
SRes (*Look)(void *p, const void **buf, size_t *size); |
23142
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
23143
|
|
|
|
|
|
|
(output(*size) > input(*size)) is not allowed |
23144
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
23145
|
|
|
|
|
|
|
SRes (*Skip)(void *p, size_t offset); |
23146
|
|
|
|
|
|
|
/* offset must be <= output(*size) of Look */ |
23147
|
|
|
|
|
|
|
|
23148
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
23149
|
|
|
|
|
|
|
/* reads directly (without buffer). It's same as ISeqInStream::Read */ |
23150
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
23151
|
|
|
|
|
|
|
}; |
23152
|
|
|
|
|
|
|
|
23153
|
|
|
|
|
|
|
SRes LookInStream_LookRead(ILookInStream *stream, void *buf, size_t *size); |
23154
|
|
|
|
|
|
|
SRes LookInStream_SeekTo(ILookInStream *stream, uint64_t offset); |
23155
|
|
|
|
|
|
|
|
23156
|
|
|
|
|
|
|
/* reads via ILookInStream::Read */ |
23157
|
|
|
|
|
|
|
SRes LookInStream_Read2(ILookInStream *stream, void *buf, size_t size, SRes errorType); |
23158
|
|
|
|
|
|
|
SRes LookInStream_Read(ILookInStream *stream, void *buf, size_t size); |
23159
|
|
|
|
|
|
|
|
23160
|
|
|
|
|
|
|
#define LookToRead_BUF_SIZE (1 << 14) |
23161
|
|
|
|
|
|
|
|
23162
|
|
|
|
|
|
|
struct CLookToRead |
23163
|
|
|
|
|
|
|
{ |
23164
|
|
|
|
|
|
|
ILookInStream s; |
23165
|
|
|
|
|
|
|
ISeekInStream *realStream; |
23166
|
|
|
|
|
|
|
size_t pos; |
23167
|
|
|
|
|
|
|
size_t size; |
23168
|
|
|
|
|
|
|
uint8_t buf[LookToRead_BUF_SIZE]; |
23169
|
|
|
|
|
|
|
}; |
23170
|
|
|
|
|
|
|
|
23171
|
|
|
|
|
|
|
void LookToRead_CreateVTable(CLookToRead *p, int lookahead); |
23172
|
|
|
|
|
|
|
void LookToRead_Init(CLookToRead *p); |
23173
|
|
|
|
|
|
|
|
23174
|
|
|
|
|
|
|
struct CSecToLook |
23175
|
|
|
|
|
|
|
{ |
23176
|
|
|
|
|
|
|
ISeqInStream s; |
23177
|
|
|
|
|
|
|
ILookInStream *realStream; |
23178
|
|
|
|
|
|
|
}; |
23179
|
|
|
|
|
|
|
|
23180
|
|
|
|
|
|
|
void SecToLook_CreateVTable(CSecToLook *p); |
23181
|
|
|
|
|
|
|
|
23182
|
|
|
|
|
|
|
struct CSecToRead |
23183
|
|
|
|
|
|
|
{ |
23184
|
|
|
|
|
|
|
ISeqInStream s; |
23185
|
|
|
|
|
|
|
ILookInStream *realStream; |
23186
|
|
|
|
|
|
|
}; |
23187
|
|
|
|
|
|
|
|
23188
|
|
|
|
|
|
|
void SecToRead_CreateVTable(CSecToRead *p); |
23189
|
|
|
|
|
|
|
|
23190
|
|
|
|
|
|
|
struct ICompressProgress |
23191
|
|
|
|
|
|
|
{ |
23192
|
|
|
|
|
|
|
SRes (*Progress)(void *p, uint64_t inSize, uint64_t outSize); |
23193
|
|
|
|
|
|
|
/* Returns: result. (result != SZ_OK) means break. |
23194
|
|
|
|
|
|
|
Value (uint64_t)(int64_t)-1 for size means unknown value. */ |
23195
|
|
|
|
|
|
|
}; |
23196
|
|
|
|
|
|
|
|
23197
|
|
|
|
|
|
|
// Allocator interface; both callbacks receive the allocator object itself as
// their first argument (see the IAlloc_Alloc / IAlloc_Free macros).
struct ISzAlloc {
  void *(*Alloc)(void *p, size_t size);

  // address can be 0.
  void (*Free)(void *p, void *address);
};
23202
|
|
|
|
|
|
|
|
23203
|
|
|
|
|
|
|
#define IAlloc_Alloc(p, size) (p)->Alloc((p), size) |
23204
|
|
|
|
|
|
|
#define IAlloc_Free(p, a) (p)->Free((p), a) |
23205
|
|
|
|
|
|
|
|
23206
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
23207
|
|
|
|
|
|
|
|
23208
|
|
|
|
|
|
|
// LzmaDec.h -- LZMA Decoder |
23209
|
|
|
|
|
|
|
// 2009-02-07 : Igor Pavlov : Public domain |
23210
|
|
|
|
|
|
|
|
23211
|
|
|
|
|
|
|
/* #define _LZMA_PROB32 */ |
23212
|
|
|
|
|
|
|
/* _LZMA_PROB32 can increase the speed on some CPUs, |
23213
|
|
|
|
|
|
|
but memory usage for CLzmaDec::probs will be doubled in that case */ |
23214
|
|
|
|
|
|
|
|
23215
|
|
|
|
|
|
|
#ifdef _LZMA_PROB32 |
23216
|
|
|
|
|
|
|
#define CLzmaProb uint32_t |
23217
|
|
|
|
|
|
|
#else |
23218
|
|
|
|
|
|
|
#define CLzmaProb uint16_t |
23219
|
|
|
|
|
|
|
#endif |
23220
|
|
|
|
|
|
|
|
23221
|
|
|
|
|
|
|
/* ---------- LZMA Properties ---------- */ |
23222
|
|
|
|
|
|
|
|
23223
|
|
|
|
|
|
|
#define LZMA_PROPS_SIZE 5 |
23224
|
|
|
|
|
|
|
|
23225
|
|
|
|
|
|
|
// Decoded LZMA stream properties (filled in by LzmaProps_Decode).
// Member order is kept: lc, lp, pb, dicSize.
struct CLzmaProps {
  unsigned lc;
  unsigned lp;
  unsigned pb;
  uint32_t dicSize;
};
23230
|
|
|
|
|
|
|
|
23231
|
|
|
|
|
|
|
/* LzmaProps_Decode - decodes properties |
23232
|
|
|
|
|
|
|
Returns: |
23233
|
|
|
|
|
|
|
SZ_OK |
23234
|
|
|
|
|
|
|
SZ_ERROR_UNSUPPORTED - Unsupported properties |
23235
|
|
|
|
|
|
|
*/ |
23236
|
|
|
|
|
|
|
|
23237
|
|
|
|
|
|
|
SRes LzmaProps_Decode(CLzmaProps *p, const uint8_t *data, unsigned size); |
23238
|
|
|
|
|
|
|
|
23239
|
|
|
|
|
|
|
/* ---------- LZMA Decoder state ---------- */ |
23240
|
|
|
|
|
|
|
|
23241
|
|
|
|
|
|
|
/* LZMA_REQUIRED_INPUT_MAX = number of required input bytes for worst case. |
23242
|
|
|
|
|
|
|
Num bits = log2((2^11 / 31) ^ 22) + 26 < 134 + 26 = 160; */ |
23243
|
|
|
|
|
|
|
|
23244
|
|
|
|
|
|
|
#define LZMA_REQUIRED_INPUT_MAX 20 |
23245
|
|
|
|
|
|
|
|
23246
|
|
|
|
|
|
|
struct CLzmaDec |
23247
|
|
|
|
|
|
|
{ |
23248
|
|
|
|
|
|
|
CLzmaProps prop; |
23249
|
|
|
|
|
|
|
CLzmaProb *probs; |
23250
|
|
|
|
|
|
|
uint8_t *dic; |
23251
|
|
|
|
|
|
|
const uint8_t *buf; |
23252
|
|
|
|
|
|
|
uint32_t range, code; |
23253
|
|
|
|
|
|
|
size_t dicPos; |
23254
|
|
|
|
|
|
|
size_t dicBufSize; |
23255
|
|
|
|
|
|
|
uint32_t processedPos; |
23256
|
|
|
|
|
|
|
uint32_t checkDicSize; |
23257
|
|
|
|
|
|
|
unsigned state; |
23258
|
|
|
|
|
|
|
uint32_t reps[4]; |
23259
|
|
|
|
|
|
|
unsigned remainLen; |
23260
|
|
|
|
|
|
|
int needFlush; |
23261
|
|
|
|
|
|
|
int needInitState; |
23262
|
|
|
|
|
|
|
uint32_t numProbs; |
23263
|
|
|
|
|
|
|
unsigned tempBufSize; |
23264
|
|
|
|
|
|
|
uint8_t tempBuf[LZMA_REQUIRED_INPUT_MAX]; |
23265
|
|
|
|
|
|
|
}; |
23266
|
|
|
|
|
|
|
|
23267
|
|
|
|
|
|
|
#define LzmaDec_Construct(p) { (p)->dic = 0; (p)->probs = 0; } |
23268
|
|
|
|
|
|
|
|
23269
|
|
|
|
|
|
|
void LzmaDec_Init(CLzmaDec *p); |
23270
|
|
|
|
|
|
|
|
23271
|
|
|
|
|
|
|
/* There are two types of LZMA streams: |
23272
|
|
|
|
|
|
|
0) Stream with end mark. That end mark adds about 6 bytes to compressed size. |
23273
|
|
|
|
|
|
|
1) Stream without end mark. You must know exact uncompressed size to decompress such stream. */ |
23274
|
|
|
|
|
|
|
|
23275
|
|
|
|
|
|
|
// Finish mode passed to the decode functions.
enum ELzmaFinishMode {
  LZMA_FINISH_ANY = 0,  // finish at any point
  LZMA_FINISH_END = 1   // block must be finished at the end
};
23280
|
|
|
|
|
|
|
|
23281
|
|
|
|
|
|
|
/* ELzmaFinishMode has meaning only if the decoding reaches output limit !!! |
23282
|
|
|
|
|
|
|
|
23283
|
|
|
|
|
|
|
You must use LZMA_FINISH_END, when you know that current output buffer |
23284
|
|
|
|
|
|
|
covers last bytes of block. In other cases you must use LZMA_FINISH_ANY. |
23285
|
|
|
|
|
|
|
|
23286
|
|
|
|
|
|
|
If LZMA decoder sees end marker before reaching output limit, it returns SZ_OK, |
23287
|
|
|
|
|
|
|
and output value of destLen will be less than output buffer size limit. |
23288
|
|
|
|
|
|
|
You can check status result also. |
23289
|
|
|
|
|
|
|
|
23290
|
|
|
|
|
|
|
You can use multiple checks to test data integrity after full decompression: |
23291
|
|
|
|
|
|
|
1) Check Result and "status" variable. |
23292
|
|
|
|
|
|
|
2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize. |
23293
|
|
|
|
|
|
|
3) Check that output(srcLen) = compressedSize, if you know real compressedSize. |
23294
|
|
|
|
|
|
|
You must use correct finish mode in that case. */ |
23295
|
|
|
|
|
|
|
|
23296
|
|
|
|
|
|
|
// Status reported by the decode functions through their `status` output.
enum ELzmaStatus {
  LZMA_STATUS_NOT_SPECIFIED = 0,               // use main error code instead
  LZMA_STATUS_FINISHED_WITH_MARK = 1,          // stream was finished with end mark.
  LZMA_STATUS_NOT_FINISHED = 2,                // stream was not finished
  LZMA_STATUS_NEEDS_MORE_INPUT = 3,            // you must provide more input bytes
  LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK = 4  // there is probability that stream was finished without end mark
};
23304
|
|
|
|
|
|
|
|
23305
|
|
|
|
|
|
|
/* ELzmaStatus is used only as output value for function call */ |
23306
|
|
|
|
|
|
|
|
23307
|
|
|
|
|
|
|
/* ---------- Interfaces ---------- */ |
23308
|
|
|
|
|
|
|
|
23309
|
|
|
|
|
|
|
/* There are 3 levels of interfaces: |
23310
|
|
|
|
|
|
|
1) Dictionary Interface |
23311
|
|
|
|
|
|
|
2) Buffer Interface |
23312
|
|
|
|
|
|
|
3) One Call Interface |
23313
|
|
|
|
|
|
|
You can select any of these interfaces, but don't mix functions from different |
23314
|
|
|
|
|
|
|
groups for same object. */ |
23315
|
|
|
|
|
|
|
|
23316
|
|
|
|
|
|
|
/* There are two variants to allocate state for Dictionary Interface: |
23317
|
|
|
|
|
|
|
1) LzmaDec_Allocate / LzmaDec_Free |
23318
|
|
|
|
|
|
|
2) LzmaDec_AllocateProbs / LzmaDec_FreeProbs |
23319
|
|
|
|
|
|
|
You can use variant 2, if you set dictionary buffer manually. |
23320
|
|
|
|
|
|
|
For Buffer Interface you must always use variant 1. |
23321
|
|
|
|
|
|
|
|
23322
|
|
|
|
|
|
|
LzmaDec_Allocate* can return: |
23323
|
|
|
|
|
|
|
SZ_OK |
23324
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
23325
|
|
|
|
|
|
|
SZ_ERROR_UNSUPPORTED - Unsupported properties |
23326
|
|
|
|
|
|
|
*/ |
23327
|
|
|
|
|
|
|
|
23328
|
|
|
|
|
|
|
SRes LzmaDec_AllocateProbs(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc); |
23329
|
|
|
|
|
|
|
void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc); |
23330
|
|
|
|
|
|
|
|
23331
|
|
|
|
|
|
|
SRes LzmaDec_Allocate(CLzmaDec *state, const uint8_t *prop, unsigned propsSize, ISzAlloc *alloc); |
23332
|
|
|
|
|
|
|
void LzmaDec_Free(CLzmaDec *state, ISzAlloc *alloc); |
23333
|
|
|
|
|
|
|
|
23334
|
|
|
|
|
|
|
/* ---------- Dictionary Interface ---------- */ |
23335
|
|
|
|
|
|
|
|
23336
|
|
|
|
|
|
|
/* You can use it, if you want to eliminate the overhead for data copying from |
23337
|
|
|
|
|
|
|
dictionary to some other external buffer. |
23338
|
|
|
|
|
|
|
You must work with CLzmaDec variables directly in this interface. |
23339
|
|
|
|
|
|
|
|
23340
|
|
|
|
|
|
|
STEPS: |
23341
|
|
|
|
|
|
|
LzmaDec_Construct() |
23342
|
|
|
|
|
|
|
LzmaDec_Allocate() |
23343
|
|
|
|
|
|
|
for (each new stream) |
23344
|
|
|
|
|
|
|
{ |
23345
|
|
|
|
|
|
|
LzmaDec_Init() |
23346
|
|
|
|
|
|
|
while (it needs more decompression) |
23347
|
|
|
|
|
|
|
{ |
23348
|
|
|
|
|
|
|
LzmaDec_DecodeToDic() |
23349
|
|
|
|
|
|
|
use data from CLzmaDec::dic and update CLzmaDec::dicPos |
23350
|
|
|
|
|
|
|
} |
23351
|
|
|
|
|
|
|
} |
23352
|
|
|
|
|
|
|
LzmaDec_Free() |
23353
|
|
|
|
|
|
|
*/ |
23354
|
|
|
|
|
|
|
|
23355
|
|
|
|
|
|
|
/* LzmaDec_DecodeToDic |
23356
|
|
|
|
|
|
|
|
23357
|
|
|
|
|
|
|
The decoding to internal dictionary buffer (CLzmaDec::dic). |
23358
|
|
|
|
|
|
|
You must manually update CLzmaDec::dicPos, if it reaches CLzmaDec::dicBufSize !!! |
23359
|
|
|
|
|
|
|
|
23360
|
|
|
|
|
|
|
finishMode: |
23361
|
|
|
|
|
|
|
It has meaning only if the decoding reaches output limit (dicLimit). |
23362
|
|
|
|
|
|
|
LZMA_FINISH_ANY - Decode just dicLimit bytes. |
23363
|
|
|
|
|
|
|
LZMA_FINISH_END - Stream must be finished after dicLimit. |
23364
|
|
|
|
|
|
|
|
23365
|
|
|
|
|
|
|
Returns: |
23366
|
|
|
|
|
|
|
SZ_OK |
23367
|
|
|
|
|
|
|
status: |
23368
|
|
|
|
|
|
|
LZMA_STATUS_FINISHED_WITH_MARK |
23369
|
|
|
|
|
|
|
LZMA_STATUS_NOT_FINISHED |
23370
|
|
|
|
|
|
|
LZMA_STATUS_NEEDS_MORE_INPUT |
23371
|
|
|
|
|
|
|
LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK |
23372
|
|
|
|
|
|
|
SZ_ERROR_DATA - Data error |
23373
|
|
|
|
|
|
|
*/ |
23374
|
|
|
|
|
|
|
|
23375
|
|
|
|
|
|
|
SRes LzmaDec_DecodeToDic(CLzmaDec *p, size_t dicLimit, |
23376
|
|
|
|
|
|
|
const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status); |
23377
|
|
|
|
|
|
|
|
23378
|
|
|
|
|
|
|
/* ---------- Buffer Interface ---------- */ |
23379
|
|
|
|
|
|
|
|
23380
|
|
|
|
|
|
|
/* It's zlib-like interface. |
23381
|
|
|
|
|
|
|
See LzmaDec_DecodeToDic description for information about STEPS and return results, |
23382
|
|
|
|
|
|
|
but you must use LzmaDec_DecodeToBuf instead of LzmaDec_DecodeToDic and you don't need |
23383
|
|
|
|
|
|
|
to work with CLzmaDec variables manually. |
23384
|
|
|
|
|
|
|
|
23385
|
|
|
|
|
|
|
finishMode: |
23386
|
|
|
|
|
|
|
It has meaning only if the decoding reaches output limit (*destLen). |
23387
|
|
|
|
|
|
|
LZMA_FINISH_ANY - Decode just destLen bytes. |
23388
|
|
|
|
|
|
|
LZMA_FINISH_END - Stream must be finished after (*destLen). |
23389
|
|
|
|
|
|
|
*/ |
23390
|
|
|
|
|
|
|
|
23391
|
|
|
|
|
|
|
SRes LzmaDec_DecodeToBuf(CLzmaDec *p, uint8_t *dest, size_t *destLen, |
23392
|
|
|
|
|
|
|
const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status); |
23393
|
|
|
|
|
|
|
|
23394
|
|
|
|
|
|
|
/* ---------- One Call Interface ---------- */ |
23395
|
|
|
|
|
|
|
|
23396
|
|
|
|
|
|
|
/* LzmaDecode |
23397
|
|
|
|
|
|
|
|
23398
|
|
|
|
|
|
|
finishMode: |
23399
|
|
|
|
|
|
|
It has meaning only if the decoding reaches output limit (*destLen). |
23400
|
|
|
|
|
|
|
LZMA_FINISH_ANY - Decode just destLen bytes. |
23401
|
|
|
|
|
|
|
LZMA_FINISH_END - Stream must be finished after (*destLen). |
23402
|
|
|
|
|
|
|
|
23403
|
|
|
|
|
|
|
Returns: |
23404
|
|
|
|
|
|
|
SZ_OK |
23405
|
|
|
|
|
|
|
status: |
23406
|
|
|
|
|
|
|
LZMA_STATUS_FINISHED_WITH_MARK |
23407
|
|
|
|
|
|
|
LZMA_STATUS_NOT_FINISHED |
23408
|
|
|
|
|
|
|
LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK |
23409
|
|
|
|
|
|
|
SZ_ERROR_DATA - Data error |
23410
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
23411
|
|
|
|
|
|
|
SZ_ERROR_UNSUPPORTED - Unsupported properties |
23412
|
|
|
|
|
|
|
SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). |
23413
|
|
|
|
|
|
|
*/ |
23414
|
|
|
|
|
|
|
|
23415
|
|
|
|
|
|
|
SRes LzmaDecode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen, |
23416
|
|
|
|
|
|
|
const uint8_t *propData, unsigned propSize, ELzmaFinishMode finishMode, |
23417
|
|
|
|
|
|
|
ELzmaStatus *status, ISzAlloc *alloc); |
23418
|
|
|
|
|
|
|
|
23419
|
|
|
|
|
|
|
// LzmaDec.c -- LZMA Decoder |
23420
|
|
|
|
|
|
|
// 2009-09-20 : Igor Pavlov : Public domain |
23421
|
|
|
|
|
|
|
|
23422
|
|
|
|
|
|
|
#define kNumTopBits 24 |
23423
|
|
|
|
|
|
|
#define kTopValue ((uint32_t)1 << kNumTopBits) |
23424
|
|
|
|
|
|
|
|
23425
|
|
|
|
|
|
|
#define kNumBitModelTotalBits 11 |
23426
|
|
|
|
|
|
|
#define kBitModelTotal (1 << kNumBitModelTotalBits) |
23427
|
|
|
|
|
|
|
#define kNumMoveBits 5 |
23428
|
|
|
|
|
|
|
|
23429
|
|
|
|
|
|
|
#define RC_INIT_SIZE 5 |
23430
|
|
|
|
|
|
|
|
23431
|
|
|
|
|
|
|
#define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } |
23432
|
|
|
|
|
|
|
|
23433
|
|
|
|
|
|
|
#define IF_BIT_0(p) ttt = *(p); NORMALIZE; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound) |
23434
|
|
|
|
|
|
|
#define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); |
23435
|
|
|
|
|
|
|
#define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits)); |
23436
|
|
|
|
|
|
|
#define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \ |
23437
|
|
|
|
|
|
|
{ UPDATE_0(p); i = (i + i); A0; } else \ |
23438
|
|
|
|
|
|
|
{ UPDATE_1(p); i = (i + i) + 1; A1; } |
23439
|
|
|
|
|
|
|
#define GET_BIT(p, i) GET_BIT2(p, i, ; , ;) |
23440
|
|
|
|
|
|
|
|
23441
|
|
|
|
|
|
|
#define TREE_GET_BIT(probs, i) { GET_BIT((probs + i), i); } |
23442
|
|
|
|
|
|
|
#define TREE_DECODE(probs, limit, i) \ |
23443
|
|
|
|
|
|
|
{ i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; } |
23444
|
|
|
|
|
|
|
|
23445
|
|
|
|
|
|
|
/* #define _LZMA_SIZE_OPT */ |
23446
|
|
|
|
|
|
|
|
23447
|
|
|
|
|
|
|
#ifdef _LZMA_SIZE_OPT |
23448
|
|
|
|
|
|
|
#define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i) |
23449
|
|
|
|
|
|
|
#else |
23450
|
|
|
|
|
|
|
#define TREE_6_DECODE(probs, i) \ |
23451
|
|
|
|
|
|
|
{ i = 1; \ |
23452
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
23453
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
23454
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
23455
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
23456
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
23457
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
23458
|
|
|
|
|
|
|
i -= 0x40; } |
23459
|
|
|
|
|
|
|
#endif |
23460
|
|
|
|
|
|
|
|
23461
|
|
|
|
|
|
|
#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } |
23462
|
|
|
|
|
|
|
|
23463
|
|
|
|
|
|
|
#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound) |
23464
|
|
|
|
|
|
|
#define UPDATE_0_CHECK range = bound; |
23465
|
|
|
|
|
|
|
#define UPDATE_1_CHECK range -= bound; code -= bound; |
23466
|
|
|
|
|
|
|
#define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \ |
23467
|
|
|
|
|
|
|
{ UPDATE_0_CHECK; i = (i + i); A0; } else \ |
23468
|
|
|
|
|
|
|
{ UPDATE_1_CHECK; i = (i + i) + 1; A1; } |
23469
|
|
|
|
|
|
|
#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;) |
23470
|
|
|
|
|
|
|
#define TREE_DECODE_CHECK(probs, limit, i) \ |
23471
|
|
|
|
|
|
|
{ i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; } |
23472
|
|
|
|
|
|
|
|
23473
|
|
|
|
|
|
|
#define kNumPosBitsMax 4 |
23474
|
|
|
|
|
|
|
#define kNumPosStatesMax (1 << kNumPosBitsMax) |
23475
|
|
|
|
|
|
|
|
23476
|
|
|
|
|
|
|
#define kLenNumLowBits 3 |
23477
|
|
|
|
|
|
|
#define kLenNumLowSymbols (1 << kLenNumLowBits) |
23478
|
|
|
|
|
|
|
#define kLenNumMidBits 3 |
23479
|
|
|
|
|
|
|
#define kLenNumMidSymbols (1 << kLenNumMidBits) |
23480
|
|
|
|
|
|
|
#define kLenNumHighBits 8 |
23481
|
|
|
|
|
|
|
#define kLenNumHighSymbols (1 << kLenNumHighBits) |
23482
|
|
|
|
|
|
|
|
23483
|
|
|
|
|
|
|
#define LenChoice 0 |
23484
|
|
|
|
|
|
|
#define LenChoice2 (LenChoice + 1) |
23485
|
|
|
|
|
|
|
#define LenLow (LenChoice2 + 1) |
23486
|
|
|
|
|
|
|
#define LenMid (LenLow + (kNumPosStatesMax << kLenNumLowBits)) |
23487
|
|
|
|
|
|
|
#define LenHigh (LenMid + (kNumPosStatesMax << kLenNumMidBits)) |
23488
|
|
|
|
|
|
|
#define kNumLenProbs (LenHigh + kLenNumHighSymbols) |
23489
|
|
|
|
|
|
|
|
23490
|
|
|
|
|
|
|
#define kNumStates 12 |
23491
|
|
|
|
|
|
|
#define kNumLitStates 7 |
23492
|
|
|
|
|
|
|
|
23493
|
|
|
|
|
|
|
#define kStartPosModelIndex 4 |
23494
|
|
|
|
|
|
|
#define kEndPosModelIndex 14 |
23495
|
|
|
|
|
|
|
#define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) |
23496
|
|
|
|
|
|
|
|
23497
|
|
|
|
|
|
|
#define kNumPosSlotBits 6 |
23498
|
|
|
|
|
|
|
#define kNumLenToPosStates 4 |
23499
|
|
|
|
|
|
|
|
23500
|
|
|
|
|
|
|
#define kNumAlignBits 4 |
23501
|
|
|
|
|
|
|
#define kAlignTableSize (1 << kNumAlignBits) |
23502
|
|
|
|
|
|
|
|
23503
|
|
|
|
|
|
|
#define kMatchMinLen 2 |
23504
|
|
|
|
|
|
|
#define kMatchSpecLenStart (kMatchMinLen + kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols) |
23505
|
|
|
|
|
|
|
|
23506
|
|
|
|
|
|
|
#define IsMatch 0 |
23507
|
|
|
|
|
|
|
#define IsRep (IsMatch + (kNumStates << kNumPosBitsMax)) |
23508
|
|
|
|
|
|
|
#define IsRepG0 (IsRep + kNumStates) |
23509
|
|
|
|
|
|
|
#define IsRepG1 (IsRepG0 + kNumStates) |
23510
|
|
|
|
|
|
|
#define IsRepG2 (IsRepG1 + kNumStates) |
23511
|
|
|
|
|
|
|
#define IsRep0Long (IsRepG2 + kNumStates) |
23512
|
|
|
|
|
|
|
#define PosSlot (IsRep0Long + (kNumStates << kNumPosBitsMax)) |
23513
|
|
|
|
|
|
|
#define SpecPos (PosSlot + (kNumLenToPosStates << kNumPosSlotBits)) |
23514
|
|
|
|
|
|
|
#define Align (SpecPos + kNumFullDistances - kEndPosModelIndex) |
23515
|
|
|
|
|
|
|
#define LenCoder (Align + kAlignTableSize) |
23516
|
|
|
|
|
|
|
#define RepLenCoder (LenCoder + kNumLenProbs) |
23517
|
|
|
|
|
|
|
#define Literal (RepLenCoder + kNumLenProbs) |
23518
|
|
|
|
|
|
|
|
23519
|
|
|
|
|
|
|
#define LZMA_BASE_SIZE 1846 |
23520
|
|
|
|
|
|
|
#define LZMA_LIT_SIZE 768 |
23521
|
|
|
|
|
|
|
|
23522
|
|
|
|
|
|
|
#define LzmaProps_GetNumProbs(p) ((uint32_t)LZMA_BASE_SIZE + (LZMA_LIT_SIZE << ((p)->lc + (p)->lp))) |
23523
|
|
|
|
|
|
|
|
23524
|
|
|
|
|
|
|
#if Literal != LZMA_BASE_SIZE |
23525
|
|
|
|
|
|
|
StopCompilingDueBUG |
23526
|
|
|
|
|
|
|
#endif |
23527
|
|
|
|
|
|
|
|
23528
|
|
|
|
|
|
|
#define LZMA_DIC_MIN (1 << 12) |
23529
|
|
|
|
|
|
|
|
23530
|
|
|
|
|
|
|
/* First LZMA-symbol is always decoded. |
23531
|
|
|
|
|
|
|
And it decodes new LZMA-symbols while (buf < bufLimit), but "buf" is without last normalization |
23532
|
|
|
|
|
|
|
Out: |
23533
|
|
|
|
|
|
|
Result: |
23534
|
|
|
|
|
|
|
SZ_OK - OK |
23535
|
|
|
|
|
|
|
SZ_ERROR_DATA - Error |
23536
|
|
|
|
|
|
|
p->remainLen: |
23537
|
|
|
|
|
|
|
< kMatchSpecLenStart : normal remain |
23538
|
|
|
|
|
|
|
= kMatchSpecLenStart : finished |
23539
|
|
|
|
|
|
|
= kMatchSpecLenStart + 1 : Flush marker |
23540
|
|
|
|
|
|
|
= kMatchSpecLenStart + 2 : State Init Marker |
23541
|
|
|
|
|
|
|
*/ |
23542
|
|
|
|
|
|
|
|
23543
|
504
|
|
|
|
|
|
static int LzmaDec_DecodeReal(CLzmaDec *p, size_t limit, const uint8_t *bufLimit) |
23544
|
|
|
|
|
|
|
{ |
23545
|
504
|
|
|
|
|
|
CLzmaProb *probs = p->probs; |
23546
|
|
|
|
|
|
|
|
23547
|
504
|
|
|
|
|
|
unsigned state = p->state; |
23548
|
504
|
|
|
|
|
|
uint32_t rep0 = p->reps[0], rep1 = p->reps[1], rep2 = p->reps[2], rep3 = p->reps[3]; |
23549
|
504
|
|
|
|
|
|
unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1; |
23550
|
504
|
|
|
|
|
|
unsigned lpMask = ((unsigned)1 << (p->prop.lp)) - 1; |
23551
|
504
|
|
|
|
|
|
unsigned lc = p->prop.lc; |
23552
|
|
|
|
|
|
|
|
23553
|
504
|
|
|
|
|
|
uint8_t *dic = p->dic; |
23554
|
504
|
|
|
|
|
|
size_t dicBufSize = p->dicBufSize; |
23555
|
504
|
|
|
|
|
|
size_t dicPos = p->dicPos; |
23556
|
|
|
|
|
|
|
|
23557
|
504
|
|
|
|
|
|
uint32_t processedPos = p->processedPos; |
23558
|
504
|
|
|
|
|
|
uint32_t checkDicSize = p->checkDicSize; |
23559
|
|
|
|
|
|
|
unsigned len = 0; |
23560
|
|
|
|
|
|
|
|
23561
|
504
|
|
|
|
|
|
const uint8_t *buf = p->buf; |
23562
|
504
|
|
|
|
|
|
uint32_t range = p->range; |
23563
|
504
|
|
|
|
|
|
uint32_t code = p->code; |
23564
|
|
|
|
|
|
|
|
23565
|
|
|
|
|
|
|
do |
23566
|
|
|
|
|
|
|
{ |
23567
|
|
|
|
|
|
|
CLzmaProb *prob; |
23568
|
|
|
|
|
|
|
uint32_t bound; |
23569
|
|
|
|
|
|
|
unsigned ttt; |
23570
|
107442
|
|
|
|
|
|
unsigned posState = processedPos & pbMask; |
23571
|
|
|
|
|
|
|
|
23572
|
107442
|
|
|
|
|
|
prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; |
23573
|
107442
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
100
|
|
|
|
|
|
23574
|
|
|
|
|
|
|
{ |
23575
|
|
|
|
|
|
|
unsigned symbol; |
23576
|
23097
|
|
|
|
|
|
UPDATE_0(prob); |
23577
|
23097
|
|
|
|
|
|
prob = probs + Literal; |
23578
|
23097
|
100
|
|
|
|
|
if (checkDicSize != 0 || processedPos != 0) |
23579
|
46182
|
|
|
|
|
|
prob += (LZMA_LIT_SIZE * (((processedPos & lpMask) << lc) + |
23580
|
23091
|
50
|
|
|
|
|
(dic[(dicPos == 0 ? dicBufSize : dicPos) - 1] >> (8 - lc)))); |
23581
|
|
|
|
|
|
|
|
23582
|
23097
|
100
|
|
|
|
|
if (state < kNumLitStates) |
23583
|
|
|
|
|
|
|
{ |
23584
|
21934
|
|
|
|
|
|
state -= (state < 4) ? state : 3; |
23585
|
|
|
|
|
|
|
symbol = 1; |
23586
|
175472
|
100
|
|
|
|
|
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23587
|
|
|
|
|
|
|
} |
23588
|
|
|
|
|
|
|
else |
23589
|
|
|
|
|
|
|
{ |
23590
|
1163
|
50
|
|
|
|
|
unsigned matchByte = p->dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
23591
|
|
|
|
|
|
|
unsigned offs = 0x100; |
23592
|
1163
|
100
|
|
|
|
|
state -= (state < 10) ? 3 : 6; |
23593
|
|
|
|
|
|
|
symbol = 1; |
23594
|
|
|
|
|
|
|
do |
23595
|
|
|
|
|
|
|
{ |
23596
|
|
|
|
|
|
|
unsigned bit; |
23597
|
|
|
|
|
|
|
CLzmaProb *probLit; |
23598
|
9304
|
|
|
|
|
|
matchByte <<= 1; |
23599
|
9304
|
|
|
|
|
|
bit = (matchByte & offs); |
23600
|
9304
|
|
|
|
|
|
probLit = prob + offs + bit + symbol; |
23601
|
9304
|
100
|
|
|
|
|
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
|
|
100
|
|
|
|
|
|
23602
|
|
|
|
|
|
|
} |
23603
|
9304
|
100
|
|
|
|
|
while (symbol < 0x100); |
23604
|
|
|
|
|
|
|
} |
23605
|
23097
|
|
|
|
|
|
dic[dicPos++] = (uint8_t)symbol; |
23606
|
23097
|
|
|
|
|
|
processedPos++; |
23607
|
23097
|
|
|
|
|
|
continue; |
23608
|
|
|
|
|
|
|
} |
23609
|
|
|
|
|
|
|
else |
23610
|
|
|
|
|
|
|
{ |
23611
|
84345
|
|
|
|
|
|
UPDATE_1(prob); |
23612
|
84345
|
|
|
|
|
|
prob = probs + IsRep + state; |
23613
|
84345
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
100
|
|
|
|
|
|
23614
|
|
|
|
|
|
|
{ |
23615
|
487
|
|
|
|
|
|
UPDATE_0(prob); |
23616
|
487
|
|
|
|
|
|
state += kNumStates; |
23617
|
487
|
|
|
|
|
|
prob = probs + LenCoder; |
23618
|
|
|
|
|
|
|
} |
23619
|
|
|
|
|
|
|
else |
23620
|
|
|
|
|
|
|
{ |
23621
|
83858
|
|
|
|
|
|
UPDATE_1(prob); |
23622
|
83858
|
50
|
|
|
|
|
if (checkDicSize == 0 && processedPos == 0) |
23623
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
23624
|
83858
|
|
|
|
|
|
prob = probs + IsRepG0 + state; |
23625
|
83858
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
100
|
|
|
|
|
|
23626
|
|
|
|
|
|
|
{ |
23627
|
83695
|
|
|
|
|
|
UPDATE_0(prob); |
23628
|
83695
|
|
|
|
|
|
prob = probs + IsRep0Long + (state << kNumPosBitsMax) + posState; |
23629
|
83695
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
100
|
|
|
|
|
|
23630
|
|
|
|
|
|
|
{ |
23631
|
645
|
|
|
|
|
|
UPDATE_0(prob); |
23632
|
645
|
50
|
|
|
|
|
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
23633
|
645
|
|
|
|
|
|
dicPos++; |
23634
|
645
|
|
|
|
|
|
processedPos++; |
23635
|
645
|
100
|
|
|
|
|
state = state < kNumLitStates ? 9 : 11; |
23636
|
|
|
|
|
|
|
continue; |
23637
|
|
|
|
|
|
|
} |
23638
|
83050
|
|
|
|
|
|
UPDATE_1(prob); |
23639
|
|
|
|
|
|
|
} |
23640
|
|
|
|
|
|
|
else |
23641
|
|
|
|
|
|
|
{ |
23642
|
|
|
|
|
|
|
uint32_t distance; |
23643
|
163
|
|
|
|
|
|
UPDATE_1(prob); |
23644
|
163
|
|
|
|
|
|
prob = probs + IsRepG1 + state; |
23645
|
163
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
100
|
|
|
|
|
|
23646
|
|
|
|
|
|
|
{ |
23647
|
97
|
|
|
|
|
|
UPDATE_0(prob); |
23648
|
|
|
|
|
|
|
distance = rep1; |
23649
|
|
|
|
|
|
|
} |
23650
|
|
|
|
|
|
|
else |
23651
|
|
|
|
|
|
|
{ |
23652
|
66
|
|
|
|
|
|
UPDATE_1(prob); |
23653
|
66
|
|
|
|
|
|
prob = probs + IsRepG2 + state; |
23654
|
66
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
100
|
|
|
|
|
|
23655
|
|
|
|
|
|
|
{ |
23656
|
37
|
|
|
|
|
|
UPDATE_0(prob); |
23657
|
|
|
|
|
|
|
distance = rep2; |
23658
|
|
|
|
|
|
|
} |
23659
|
|
|
|
|
|
|
else |
23660
|
|
|
|
|
|
|
{ |
23661
|
29
|
|
|
|
|
|
UPDATE_1(prob); |
23662
|
|
|
|
|
|
|
distance = rep3; |
23663
|
|
|
|
|
|
|
rep3 = rep2; |
23664
|
|
|
|
|
|
|
} |
23665
|
|
|
|
|
|
|
rep2 = rep1; |
23666
|
|
|
|
|
|
|
} |
23667
|
|
|
|
|
|
|
rep1 = rep0; |
23668
|
|
|
|
|
|
|
rep0 = distance; |
23669
|
|
|
|
|
|
|
} |
23670
|
83213
|
100
|
|
|
|
|
state = state < kNumLitStates ? 8 : 11; |
23671
|
83213
|
|
|
|
|
|
prob = probs + RepLenCoder; |
23672
|
|
|
|
|
|
|
} |
23673
|
|
|
|
|
|
|
{ |
23674
|
|
|
|
|
|
|
unsigned limit, offset; |
23675
|
|
|
|
|
|
|
CLzmaProb *probLen = prob + LenChoice; |
23676
|
83700
|
100
|
|
|
|
|
IF_BIT_0(probLen) |
|
|
100
|
|
|
|
|
|
23677
|
|
|
|
|
|
|
{ |
23678
|
445
|
|
|
|
|
|
UPDATE_0(probLen); |
23679
|
445
|
|
|
|
|
|
probLen = prob + LenLow + (posState << kLenNumLowBits); |
23680
|
|
|
|
|
|
|
offset = 0; |
23681
|
|
|
|
|
|
|
limit = (1 << kLenNumLowBits); |
23682
|
|
|
|
|
|
|
} |
23683
|
|
|
|
|
|
|
else |
23684
|
|
|
|
|
|
|
{ |
23685
|
83255
|
|
|
|
|
|
UPDATE_1(probLen); |
23686
|
|
|
|
|
|
|
probLen = prob + LenChoice2; |
23687
|
83255
|
100
|
|
|
|
|
IF_BIT_0(probLen) |
|
|
100
|
|
|
|
|
|
23688
|
|
|
|
|
|
|
{ |
23689
|
113
|
|
|
|
|
|
UPDATE_0(probLen); |
23690
|
113
|
|
|
|
|
|
probLen = prob + LenMid + (posState << kLenNumMidBits); |
23691
|
|
|
|
|
|
|
offset = kLenNumLowSymbols; |
23692
|
|
|
|
|
|
|
limit = (1 << kLenNumMidBits); |
23693
|
|
|
|
|
|
|
} |
23694
|
|
|
|
|
|
|
else |
23695
|
|
|
|
|
|
|
{ |
23696
|
83142
|
|
|
|
|
|
UPDATE_1(probLen); |
23697
|
83700
|
|
|
|
|
|
probLen = prob + LenHigh; |
23698
|
|
|
|
|
|
|
offset = kLenNumLowSymbols + kLenNumMidSymbols; |
23699
|
|
|
|
|
|
|
limit = (1 << kLenNumHighBits); |
23700
|
|
|
|
|
|
|
} |
23701
|
|
|
|
|
|
|
} |
23702
|
666810
|
100
|
|
|
|
|
TREE_DECODE(probLen, limit, len); |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23703
|
83700
|
|
|
|
|
|
len += offset; |
23704
|
|
|
|
|
|
|
} |
23705
|
|
|
|
|
|
|
|
23706
|
83700
|
100
|
|
|
|
|
if (state >= kNumStates) |
23707
|
|
|
|
|
|
|
{ |
23708
|
|
|
|
|
|
|
uint32_t distance; |
23709
|
487
|
|
|
|
|
|
prob = probs + PosSlot + |
23710
|
974
|
|
|
|
|
|
((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); |
23711
|
487
|
100
|
|
|
|
|
TREE_6_DECODE(prob, distance); |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23712
|
487
|
100
|
|
|
|
|
if (distance >= kStartPosModelIndex) |
23713
|
|
|
|
|
|
|
{ |
23714
|
|
|
|
|
|
|
unsigned posSlot = (unsigned)distance; |
23715
|
405
|
|
|
|
|
|
int numDirectBits = (int)(((distance >> 1) - 1)); |
23716
|
405
|
|
|
|
|
|
distance = (2 | (distance & 1)); |
23717
|
405
|
100
|
|
|
|
|
if (posSlot < kEndPosModelIndex) |
23718
|
|
|
|
|
|
|
{ |
23719
|
166
|
|
|
|
|
|
distance <<= numDirectBits; |
23720
|
166
|
|
|
|
|
|
prob = probs + SpecPos + distance - posSlot - 1; |
23721
|
|
|
|
|
|
|
{ |
23722
|
|
|
|
|
|
|
uint32_t mask = 1; |
23723
|
|
|
|
|
|
|
unsigned i = 1; |
23724
|
491
|
100
|
|
|
|
|
do |
23725
|
|
|
|
|
|
|
{ |
23726
|
491
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= mask); |
|
|
100
|
|
|
|
|
|
23727
|
491
|
|
|
|
|
|
mask <<= 1; |
23728
|
|
|
|
|
|
|
} |
23729
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
23730
|
|
|
|
|
|
|
} |
23731
|
|
|
|
|
|
|
} |
23732
|
|
|
|
|
|
|
else |
23733
|
|
|
|
|
|
|
{ |
23734
|
239
|
|
|
|
|
|
numDirectBits -= kNumAlignBits; |
23735
|
2572
|
100
|
|
|
|
|
do |
23736
|
|
|
|
|
|
|
{ |
23737
|
2572
|
100
|
|
|
|
|
NORMALIZE |
23738
|
2572
|
|
|
|
|
|
range >>= 1; |
23739
|
|
|
|
|
|
|
|
23740
|
|
|
|
|
|
|
{ |
23741
|
|
|
|
|
|
|
uint32_t t; |
23742
|
2572
|
|
|
|
|
|
code -= range; |
23743
|
2572
|
|
|
|
|
|
t = (0 - ((uint32_t)code >> 31)); /* (uint32_t)((int32_t)code >> 31) */ |
23744
|
2572
|
|
|
|
|
|
distance = (distance << 1) + (t + 1); |
23745
|
2572
|
|
|
|
|
|
code += range & t; |
23746
|
|
|
|
|
|
|
} |
23747
|
|
|
|
|
|
|
/* |
23748
|
|
|
|
|
|
|
distance <<= 1; |
23749
|
|
|
|
|
|
|
if (code >= range) |
23750
|
|
|
|
|
|
|
{ |
23751
|
|
|
|
|
|
|
code -= range; |
23752
|
|
|
|
|
|
|
distance |= 1; |
23753
|
|
|
|
|
|
|
} |
23754
|
|
|
|
|
|
|
*/ |
23755
|
|
|
|
|
|
|
} |
23756
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
23757
|
239
|
|
|
|
|
|
prob = probs + Align; |
23758
|
239
|
|
|
|
|
|
distance <<= kNumAlignBits; |
23759
|
|
|
|
|
|
|
{ |
23760
|
|
|
|
|
|
|
unsigned i = 1; |
23761
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 1); |
|
|
100
|
|
|
|
|
|
23762
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 2); |
|
|
100
|
|
|
|
|
|
23763
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 4); |
|
|
100
|
|
|
|
|
|
23764
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 8); |
|
|
100
|
|
|
|
|
|
23765
|
|
|
|
|
|
|
} |
23766
|
239
|
50
|
|
|
|
|
if (distance == (uint32_t)0xFFFFFFFF) |
23767
|
|
|
|
|
|
|
{ |
23768
|
0
|
|
|
|
|
|
len += kMatchSpecLenStart; |
23769
|
0
|
|
|
|
|
|
state -= kNumStates; |
23770
|
0
|
|
|
|
|
|
break; |
23771
|
|
|
|
|
|
|
} |
23772
|
|
|
|
|
|
|
} |
23773
|
|
|
|
|
|
|
} |
23774
|
|
|
|
|
|
|
rep3 = rep2; |
23775
|
|
|
|
|
|
|
rep2 = rep1; |
23776
|
|
|
|
|
|
|
rep1 = rep0; |
23777
|
487
|
|
|
|
|
|
rep0 = distance + 1; |
23778
|
487
|
50
|
|
|
|
|
if (checkDicSize == 0) |
23779
|
|
|
|
|
|
|
{ |
23780
|
487
|
50
|
|
|
|
|
if (distance >= processedPos) |
23781
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
23782
|
|
|
|
|
|
|
} |
23783
|
0
|
0
|
|
|
|
|
else if (distance >= checkDicSize) |
23784
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
23785
|
487
|
100
|
|
|
|
|
state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; |
23786
|
|
|
|
|
|
|
} |
23787
|
|
|
|
|
|
|
|
23788
|
83700
|
|
|
|
|
|
len += kMatchMinLen; |
23789
|
|
|
|
|
|
|
|
23790
|
83700
|
50
|
|
|
|
|
if (limit == dicPos) |
23791
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
23792
|
|
|
|
|
|
|
{ |
23793
|
83700
|
|
|
|
|
|
size_t rem = limit - dicPos; |
23794
|
83700
|
50
|
|
|
|
|
unsigned curLen = ((rem < len) ? (unsigned)rem : len); |
23795
|
83700
|
50
|
|
|
|
|
size_t pos = (dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0); |
23796
|
|
|
|
|
|
|
|
23797
|
83700
|
|
|
|
|
|
processedPos += curLen; |
23798
|
|
|
|
|
|
|
|
23799
|
83700
|
|
|
|
|
|
len -= curLen; |
23800
|
83700
|
50
|
|
|
|
|
if (pos + curLen <= dicBufSize) |
23801
|
|
|
|
|
|
|
{ |
23802
|
83700
|
|
|
|
|
|
uint8_t *dest = dic + dicPos; |
23803
|
83700
|
|
|
|
|
|
ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos; |
23804
|
83700
|
|
|
|
|
|
const uint8_t *lim = dest + curLen; |
23805
|
83700
|
|
|
|
|
|
dicPos += curLen; |
23806
|
22650228
|
100
|
|
|
|
|
do |
23807
|
22650228
|
|
|
|
|
|
*(dest) = (uint8_t)*(dest + src); |
23808
|
|
|
|
|
|
|
while (++dest != lim); |
23809
|
|
|
|
|
|
|
} |
23810
|
|
|
|
|
|
|
else |
23811
|
|
|
|
|
|
|
{ |
23812
|
0
|
0
|
|
|
|
|
do |
23813
|
|
|
|
|
|
|
{ |
23814
|
0
|
|
|
|
|
|
dic[dicPos++] = dic[pos]; |
23815
|
0
|
0
|
|
|
|
|
if (++pos == dicBufSize) |
23816
|
|
|
|
|
|
|
pos = 0; |
23817
|
|
|
|
|
|
|
} |
23818
|
|
|
|
|
|
|
while (--curLen != 0); |
23819
|
|
|
|
|
|
|
} |
23820
|
|
|
|
|
|
|
} |
23821
|
|
|
|
|
|
|
} |
23822
|
|
|
|
|
|
|
} |
23823
|
107442
|
100
|
|
|
|
|
while (dicPos < limit && buf < bufLimit); |
23824
|
504
|
100
|
|
|
|
|
NORMALIZE; |
23825
|
504
|
|
|
|
|
|
p->buf = buf; |
23826
|
504
|
|
|
|
|
|
p->range = range; |
23827
|
504
|
|
|
|
|
|
p->code = code; |
23828
|
504
|
|
|
|
|
|
p->remainLen = len; |
23829
|
504
|
|
|
|
|
|
p->dicPos = dicPos; |
23830
|
504
|
|
|
|
|
|
p->processedPos = processedPos; |
23831
|
504
|
|
|
|
|
|
p->reps[0] = rep0; |
23832
|
504
|
|
|
|
|
|
p->reps[1] = rep1; |
23833
|
504
|
|
|
|
|
|
p->reps[2] = rep2; |
23834
|
504
|
|
|
|
|
|
p->reps[3] = rep3; |
23835
|
504
|
|
|
|
|
|
p->state = state; |
23836
|
|
|
|
|
|
|
|
23837
|
504
|
|
|
|
|
|
return SZ_OK; |
23838
|
|
|
|
|
|
|
} |
23839
|
|
|
|
|
|
|
|
23840
|
510
|
|
|
|
|
|
static void LzmaDec_WriteRem(CLzmaDec *p, size_t limit) |
23841
|
|
|
|
|
|
|
{ |
23842
|
510
|
50
|
|
|
|
|
if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) |
23843
|
|
|
|
|
|
|
{ |
23844
|
0
|
|
|
|
|
|
uint8_t *dic = p->dic; |
23845
|
0
|
|
|
|
|
|
size_t dicPos = p->dicPos; |
23846
|
0
|
|
|
|
|
|
size_t dicBufSize = p->dicBufSize; |
23847
|
|
|
|
|
|
|
unsigned len = p->remainLen; |
23848
|
0
|
|
|
|
|
|
uint32_t rep0 = p->reps[0]; |
23849
|
0
|
0
|
|
|
|
|
if (limit - dicPos < len) |
23850
|
0
|
|
|
|
|
|
len = (unsigned)(limit - dicPos); |
23851
|
|
|
|
|
|
|
|
23852
|
0
|
0
|
|
|
|
|
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
|
|
0
|
|
|
|
|
|
23853
|
0
|
|
|
|
|
|
p->checkDicSize = p->prop.dicSize; |
23854
|
|
|
|
|
|
|
|
23855
|
0
|
|
|
|
|
|
p->processedPos += len; |
23856
|
0
|
|
|
|
|
|
p->remainLen -= len; |
23857
|
0
|
0
|
|
|
|
|
while (len-- != 0) |
23858
|
|
|
|
|
|
|
{ |
23859
|
0
|
0
|
|
|
|
|
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
23860
|
0
|
|
|
|
|
|
dicPos++; |
23861
|
|
|
|
|
|
|
} |
23862
|
0
|
|
|
|
|
|
p->dicPos = dicPos; |
23863
|
|
|
|
|
|
|
} |
23864
|
510
|
|
|
|
|
|
} |
23865
|
|
|
|
|
|
|
|
23866
|
1008
|
|
|
|
|
|
static int LzmaDec_DecodeReal2(CLzmaDec *p, size_t limit, const uint8_t *bufLimit) |
23867
|
|
|
|
|
|
|
{ |
23868
|
|
|
|
|
|
|
do |
23869
|
|
|
|
|
|
|
{ |
23870
|
|
|
|
|
|
|
size_t limit2 = limit; |
23871
|
504
|
50
|
|
|
|
|
if (p->checkDicSize == 0) |
23872
|
|
|
|
|
|
|
{ |
23873
|
504
|
|
|
|
|
|
uint32_t rem = p->prop.dicSize - p->processedPos; |
23874
|
504
|
50
|
|
|
|
|
if (limit - p->dicPos > rem) |
23875
|
0
|
|
|
|
|
|
limit2 = p->dicPos + rem; |
23876
|
|
|
|
|
|
|
} |
23877
|
504
|
50
|
|
|
|
|
RINOK(LzmaDec_DecodeReal(p, limit2, bufLimit)); |
23878
|
504
|
50
|
|
|
|
|
if (p->processedPos >= p->prop.dicSize) |
23879
|
0
|
|
|
|
|
|
p->checkDicSize = p->prop.dicSize; |
23880
|
504
|
|
|
|
|
|
LzmaDec_WriteRem(p, limit); |
23881
|
|
|
|
|
|
|
} |
23882
|
504
|
100
|
|
|
|
|
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
23883
|
|
|
|
|
|
|
|
23884
|
504
|
50
|
|
|
|
|
if (p->remainLen > kMatchSpecLenStart) |
23885
|
|
|
|
|
|
|
{ |
23886
|
0
|
|
|
|
|
|
p->remainLen = kMatchSpecLenStart; |
23887
|
|
|
|
|
|
|
} |
23888
|
|
|
|
|
|
|
return 0; |
23889
|
|
|
|
|
|
|
} |
23890
|
|
|
|
|
|
|
|
23891
|
|
|
|
|
|
|
enum ELzmaDummy |
23892
|
|
|
|
|
|
|
{ |
23893
|
|
|
|
|
|
|
DUMMY_ERROR, /* unexpected end of input stream */ |
23894
|
|
|
|
|
|
|
DUMMY_LIT, |
23895
|
|
|
|
|
|
|
DUMMY_MATCH, |
23896
|
|
|
|
|
|
|
DUMMY_REP |
23897
|
|
|
|
|
|
|
}; |
23898
|
|
|
|
|
|
|
|
23899
|
470
|
|
|
|
|
|
static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const uint8_t *buf, size_t inSize) |
23900
|
|
|
|
|
|
|
{ |
23901
|
470
|
|
|
|
|
|
uint32_t range = p->range; |
23902
|
470
|
|
|
|
|
|
uint32_t code = p->code; |
23903
|
470
|
|
|
|
|
|
const uint8_t *bufLimit = buf + inSize; |
23904
|
470
|
|
|
|
|
|
CLzmaProb *probs = p->probs; |
23905
|
470
|
|
|
|
|
|
unsigned state = p->state; |
23906
|
|
|
|
|
|
|
ELzmaDummy res; |
23907
|
|
|
|
|
|
|
|
23908
|
|
|
|
|
|
|
{ |
23909
|
|
|
|
|
|
|
CLzmaProb *prob; |
23910
|
|
|
|
|
|
|
uint32_t bound; |
23911
|
|
|
|
|
|
|
unsigned ttt; |
23912
|
470
|
|
|
|
|
|
unsigned posState = (p->processedPos) & ((1 << p->prop.pb) - 1); |
23913
|
|
|
|
|
|
|
|
23914
|
470
|
|
|
|
|
|
prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; |
23915
|
470
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23916
|
|
|
|
|
|
|
{ |
23917
|
|
|
|
|
|
|
UPDATE_0_CHECK |
23918
|
|
|
|
|
|
|
|
23919
|
|
|
|
|
|
|
/* if (bufLimit - buf >= 7) return DUMMY_LIT; */ |
23920
|
|
|
|
|
|
|
|
23921
|
37
|
|
|
|
|
|
prob = probs + Literal; |
23922
|
37
|
100
|
|
|
|
|
if (p->checkDicSize != 0 || p->processedPos != 0) |
23923
|
36
|
|
|
|
|
|
prob += (LZMA_LIT_SIZE * |
23924
|
72
|
|
|
|
|
|
((((p->processedPos) & ((1 << (p->prop.lp)) - 1)) << p->prop.lc) + |
23925
|
36
|
50
|
|
|
|
|
(p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); |
23926
|
|
|
|
|
|
|
|
23927
|
37
|
100
|
|
|
|
|
if (state < kNumLitStates) |
23928
|
|
|
|
|
|
|
{ |
23929
|
|
|
|
|
|
|
unsigned symbol = 1; |
23930
|
216
|
100
|
|
|
|
|
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23931
|
|
|
|
|
|
|
} |
23932
|
|
|
|
|
|
|
else |
23933
|
|
|
|
|
|
|
{ |
23934
|
20
|
|
|
|
|
|
unsigned matchByte = p->dic[p->dicPos - p->reps[0] + |
23935
|
10
|
50
|
|
|
|
|
((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)]; |
23936
|
|
|
|
|
|
|
unsigned offs = 0x100; |
23937
|
|
|
|
|
|
|
unsigned symbol = 1; |
23938
|
|
|
|
|
|
|
do |
23939
|
|
|
|
|
|
|
{ |
23940
|
|
|
|
|
|
|
unsigned bit; |
23941
|
|
|
|
|
|
|
CLzmaProb *probLit; |
23942
|
80
|
|
|
|
|
|
matchByte <<= 1; |
23943
|
80
|
|
|
|
|
|
bit = (matchByte & offs); |
23944
|
80
|
|
|
|
|
|
probLit = prob + offs + bit + symbol; |
23945
|
80
|
100
|
|
|
|
|
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23946
|
|
|
|
|
|
|
} |
23947
|
80
|
100
|
|
|
|
|
while (symbol < 0x100); |
23948
|
|
|
|
|
|
|
} |
23949
|
|
|
|
|
|
|
res = DUMMY_LIT; |
23950
|
|
|
|
|
|
|
} |
23951
|
|
|
|
|
|
|
else |
23952
|
|
|
|
|
|
|
{ |
23953
|
|
|
|
|
|
|
unsigned len; |
23954
|
433
|
|
|
|
|
|
UPDATE_1_CHECK; |
23955
|
|
|
|
|
|
|
|
23956
|
433
|
|
|
|
|
|
prob = probs + IsRep + state; |
23957
|
433
|
100
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23958
|
|
|
|
|
|
|
{ |
23959
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
23960
|
|
|
|
|
|
|
state = 0; |
23961
|
17
|
|
|
|
|
|
prob = probs + LenCoder; |
23962
|
|
|
|
|
|
|
res = DUMMY_MATCH; |
23963
|
|
|
|
|
|
|
} |
23964
|
|
|
|
|
|
|
else |
23965
|
|
|
|
|
|
|
{ |
23966
|
416
|
|
|
|
|
|
UPDATE_1_CHECK; |
23967
|
|
|
|
|
|
|
res = DUMMY_REP; |
23968
|
416
|
|
|
|
|
|
prob = probs + IsRepG0 + state; |
23969
|
416
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
0
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
23970
|
|
|
|
|
|
|
{ |
23971
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
23972
|
415
|
|
|
|
|
|
prob = probs + IsRep0Long + (state << kNumPosBitsMax) + posState; |
23973
|
415
|
100
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
23974
|
|
|
|
|
|
|
{ |
23975
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
23976
|
0
|
0
|
|
|
|
|
NORMALIZE_CHECK; |
|
|
0
|
|
|
|
|
|
23977
|
|
|
|
|
|
|
return DUMMY_REP; |
23978
|
|
|
|
|
|
|
} |
23979
|
|
|
|
|
|
|
else |
23980
|
|
|
|
|
|
|
{ |
23981
|
415
|
|
|
|
|
|
UPDATE_1_CHECK; |
23982
|
|
|
|
|
|
|
} |
23983
|
|
|
|
|
|
|
} |
23984
|
|
|
|
|
|
|
else |
23985
|
|
|
|
|
|
|
{ |
23986
|
1
|
|
|
|
|
|
UPDATE_1_CHECK; |
23987
|
1
|
|
|
|
|
|
prob = probs + IsRepG1 + state; |
23988
|
1
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
23989
|
|
|
|
|
|
|
{ |
23990
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
23991
|
|
|
|
|
|
|
} |
23992
|
|
|
|
|
|
|
else |
23993
|
|
|
|
|
|
|
{ |
23994
|
1
|
|
|
|
|
|
UPDATE_1_CHECK; |
23995
|
1
|
|
|
|
|
|
prob = probs + IsRepG2 + state; |
23996
|
1
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
0
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
23997
|
|
|
|
|
|
|
{ |
23998
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
23999
|
|
|
|
|
|
|
} |
24000
|
|
|
|
|
|
|
else |
24001
|
|
|
|
|
|
|
{ |
24002
|
0
|
|
|
|
|
|
UPDATE_1_CHECK; |
24003
|
|
|
|
|
|
|
} |
24004
|
|
|
|
|
|
|
} |
24005
|
|
|
|
|
|
|
} |
24006
|
|
|
|
|
|
|
state = kNumStates; |
24007
|
416
|
|
|
|
|
|
prob = probs + RepLenCoder; |
24008
|
|
|
|
|
|
|
} |
24009
|
|
|
|
|
|
|
{ |
24010
|
|
|
|
|
|
|
unsigned limit, offset; |
24011
|
|
|
|
|
|
|
CLzmaProb *probLen = prob + LenChoice; |
24012
|
433
|
100
|
|
|
|
|
IF_BIT_0_CHECK(probLen) |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
24013
|
|
|
|
|
|
|
{ |
24014
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
24015
|
16
|
|
|
|
|
|
probLen = prob + LenLow + (posState << kLenNumLowBits); |
24016
|
|
|
|
|
|
|
offset = 0; |
24017
|
|
|
|
|
|
|
limit = 1 << kLenNumLowBits; |
24018
|
|
|
|
|
|
|
} |
24019
|
|
|
|
|
|
|
else |
24020
|
|
|
|
|
|
|
{ |
24021
|
417
|
|
|
|
|
|
UPDATE_1_CHECK; |
24022
|
|
|
|
|
|
|
probLen = prob + LenChoice2; |
24023
|
417
|
100
|
|
|
|
|
IF_BIT_0_CHECK(probLen) |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
24024
|
|
|
|
|
|
|
{ |
24025
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
24026
|
2
|
|
|
|
|
|
probLen = prob + LenMid + (posState << kLenNumMidBits); |
24027
|
|
|
|
|
|
|
offset = kLenNumLowSymbols; |
24028
|
|
|
|
|
|
|
limit = 1 << kLenNumMidBits; |
24029
|
|
|
|
|
|
|
} |
24030
|
|
|
|
|
|
|
else |
24031
|
|
|
|
|
|
|
{ |
24032
|
415
|
|
|
|
|
|
UPDATE_1_CHECK; |
24033
|
433
|
|
|
|
|
|
probLen = prob + LenHigh; |
24034
|
|
|
|
|
|
|
offset = kLenNumLowSymbols + kLenNumMidSymbols; |
24035
|
|
|
|
|
|
|
limit = 1 << kLenNumHighBits; |
24036
|
|
|
|
|
|
|
} |
24037
|
|
|
|
|
|
|
} |
24038
|
3374
|
100
|
|
|
|
|
TREE_DECODE_CHECK(probLen, limit, len); |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
24039
|
433
|
|
|
|
|
|
len += offset; |
24040
|
|
|
|
|
|
|
} |
24041
|
|
|
|
|
|
|
|
24042
|
433
|
100
|
|
|
|
|
if (state < 4) |
24043
|
|
|
|
|
|
|
{ |
24044
|
|
|
|
|
|
|
unsigned posSlot; |
24045
|
17
|
|
|
|
|
|
prob = probs + PosSlot + |
24046
|
17
|
|
|
|
|
|
((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << |
24047
|
17
|
|
|
|
|
|
kNumPosSlotBits); |
24048
|
102
|
100
|
|
|
|
|
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
24049
|
17
|
100
|
|
|
|
|
if (posSlot >= kStartPosModelIndex) |
24050
|
|
|
|
|
|
|
{ |
24051
|
14
|
|
|
|
|
|
int numDirectBits = ((posSlot >> 1) - 1); |
24052
|
|
|
|
|
|
|
|
24053
|
|
|
|
|
|
|
/* if (bufLimit - buf >= 8) return DUMMY_MATCH; */ |
24054
|
|
|
|
|
|
|
|
24055
|
14
|
100
|
|
|
|
|
if (posSlot < kEndPosModelIndex) |
24056
|
|
|
|
|
|
|
{ |
24057
|
9
|
|
|
|
|
|
prob = probs + SpecPos + ((2 | (posSlot & 1)) << numDirectBits) - posSlot - 1; |
24058
|
|
|
|
|
|
|
} |
24059
|
|
|
|
|
|
|
else |
24060
|
|
|
|
|
|
|
{ |
24061
|
5
|
|
|
|
|
|
numDirectBits -= kNumAlignBits; |
24062
|
42
|
100
|
|
|
|
|
do |
24063
|
|
|
|
|
|
|
{ |
24064
|
42
|
100
|
|
|
|
|
NORMALIZE_CHECK |
|
|
50
|
|
|
|
|
|
24065
|
42
|
|
|
|
|
|
range >>= 1; |
24066
|
42
|
|
|
|
|
|
code -= range & (((code - range) >> 31) - 1); |
24067
|
|
|
|
|
|
|
/* if (code >= range) code -= range; */ |
24068
|
|
|
|
|
|
|
} |
24069
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
24070
|
14
|
|
|
|
|
|
prob = probs + Align; |
24071
|
|
|
|
|
|
|
numDirectBits = kNumAlignBits; |
24072
|
|
|
|
|
|
|
} |
24073
|
|
|
|
|
|
|
{ |
24074
|
|
|
|
|
|
|
unsigned i = 1; |
24075
|
47
|
100
|
|
|
|
|
do |
24076
|
|
|
|
|
|
|
{ |
24077
|
47
|
100
|
|
|
|
|
GET_BIT_CHECK(prob + i, i); |
|
|
50
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
24078
|
|
|
|
|
|
|
} |
24079
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
24080
|
|
|
|
|
|
|
} |
24081
|
|
|
|
|
|
|
} |
24082
|
|
|
|
|
|
|
} |
24083
|
|
|
|
|
|
|
} |
24084
|
|
|
|
|
|
|
} |
24085
|
470
|
100
|
|
|
|
|
NORMALIZE_CHECK; |
|
|
50
|
|
|
|
|
|
24086
|
|
|
|
|
|
|
return res; |
24087
|
|
|
|
|
|
|
} |
24088
|
|
|
|
|
|
|
|
24089
|
|
|
|
|
|
|
static void LzmaDec_InitRc(CLzmaDec *p, const uint8_t *data) |
24090
|
|
|
|
|
|
|
{ |
24091
|
6
|
|
|
|
|
|
p->code = ((uint32_t)data[1] << 24) | ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 8) | ((uint32_t)data[4]); |
24092
|
6
|
|
|
|
|
|
p->range = 0xFFFFFFFF; |
24093
|
6
|
|
|
|
|
|
p->needFlush = 0; |
24094
|
|
|
|
|
|
|
} |
24095
|
|
|
|
|
|
|
|
24096
|
0
|
|
|
|
|
|
void LzmaDec_InitDicAndState(CLzmaDec *p, bool initDic, bool initState) |
24097
|
|
|
|
|
|
|
{ |
24098
|
6
|
|
|
|
|
|
p->needFlush = 1; |
24099
|
6
|
|
|
|
|
|
p->remainLen = 0; |
24100
|
6
|
|
|
|
|
|
p->tempBufSize = 0; |
24101
|
|
|
|
|
|
|
|
24102
|
0
|
0
|
|
|
|
|
if (initDic) |
24103
|
|
|
|
|
|
|
{ |
24104
|
6
|
|
|
|
|
|
p->processedPos = 0; |
24105
|
6
|
|
|
|
|
|
p->checkDicSize = 0; |
24106
|
0
|
|
|
|
|
|
p->needInitState = 1; |
24107
|
|
|
|
|
|
|
} |
24108
|
0
|
0
|
|
|
|
|
if (initState) |
24109
|
0
|
|
|
|
|
|
p->needInitState = 1; |
24110
|
0
|
|
|
|
|
|
} |
24111
|
|
|
|
|
|
|
|
24112
|
0
|
|
|
|
|
|
void LzmaDec_Init(CLzmaDec *p) |
24113
|
|
|
|
|
|
|
{ |
24114
|
6
|
|
|
|
|
|
p->dicPos = 0; |
24115
|
|
|
|
|
|
|
LzmaDec_InitDicAndState(p, true, true); |
24116
|
0
|
|
|
|
|
|
} |
24117
|
|
|
|
|
|
|
|
24118
|
|
|
|
|
|
|
static void LzmaDec_InitStateReal(CLzmaDec *p) |
24119
|
|
|
|
|
|
|
{ |
24120
|
6
|
|
|
|
|
|
uint32_t numProbs = Literal + ((uint32_t)LZMA_LIT_SIZE << (p->prop.lc + p->prop.lp)); |
24121
|
|
|
|
|
|
|
uint32_t i; |
24122
|
6
|
|
|
|
|
|
CLzmaProb *probs = p->probs; |
24123
|
47946
|
100
|
|
|
|
|
for (i = 0; i < numProbs; i++) |
24124
|
47940
|
|
|
|
|
|
probs[i] = kBitModelTotal >> 1; |
24125
|
6
|
|
|
|
|
|
p->reps[0] = p->reps[1] = p->reps[2] = p->reps[3] = 1; |
24126
|
6
|
|
|
|
|
|
p->state = 0; |
24127
|
6
|
|
|
|
|
|
p->needInitState = 0; |
24128
|
|
|
|
|
|
|
} |
24129
|
|
|
|
|
|
|
|
24130
|
6
|
|
|
|
|
|
SRes LzmaDec_DecodeToDic(CLzmaDec *p, size_t dicLimit, const uint8_t *src, size_t *srcLen, |
24131
|
|
|
|
|
|
|
ELzmaFinishMode finishMode, ELzmaStatus *status) |
24132
|
|
|
|
|
|
|
{ |
24133
|
6
|
|
|
|
|
|
size_t inSize = *srcLen; |
24134
|
6
|
|
|
|
|
|
(*srcLen) = 0; |
24135
|
6
|
|
|
|
|
|
LzmaDec_WriteRem(p, dicLimit); |
24136
|
|
|
|
|
|
|
|
24137
|
510
|
|
|
|
|
|
*status = LZMA_STATUS_NOT_SPECIFIED; |
24138
|
|
|
|
|
|
|
|
24139
|
510
|
50
|
|
|
|
|
while (p->remainLen != kMatchSpecLenStart) |
24140
|
|
|
|
|
|
|
{ |
24141
|
|
|
|
|
|
|
int checkEndMarkNow; |
24142
|
|
|
|
|
|
|
|
24143
|
510
|
100
|
|
|
|
|
if (p->needFlush != 0) |
24144
|
|
|
|
|
|
|
{ |
24145
|
36
|
50
|
|
|
|
|
for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) |
|
|
100
|
|
|
|
|
|
24146
|
30
|
|
|
|
|
|
p->tempBuf[p->tempBufSize++] = *src++; |
24147
|
6
|
50
|
|
|
|
|
if (p->tempBufSize < RC_INIT_SIZE) |
24148
|
|
|
|
|
|
|
{ |
24149
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_NEEDS_MORE_INPUT; |
24150
|
0
|
|
|
|
|
|
return SZ_OK; |
24151
|
|
|
|
|
|
|
} |
24152
|
6
|
50
|
|
|
|
|
if (p->tempBuf[0] != 0) |
24153
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
24154
|
|
|
|
|
|
|
|
24155
|
|
|
|
|
|
|
LzmaDec_InitRc(p, p->tempBuf); |
24156
|
6
|
|
|
|
|
|
p->tempBufSize = 0; |
24157
|
|
|
|
|
|
|
} |
24158
|
|
|
|
|
|
|
|
24159
|
|
|
|
|
|
|
checkEndMarkNow = 0; |
24160
|
510
|
100
|
|
|
|
|
if (p->dicPos >= dicLimit) |
24161
|
|
|
|
|
|
|
{ |
24162
|
6
|
50
|
|
|
|
|
if (p->remainLen == 0 && p->code == 0) |
|
|
50
|
|
|
|
|
|
24163
|
|
|
|
|
|
|
{ |
24164
|
6
|
|
|
|
|
|
*status = LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK; |
24165
|
6
|
|
|
|
|
|
return SZ_OK; |
24166
|
|
|
|
|
|
|
} |
24167
|
0
|
0
|
|
|
|
|
if (finishMode == LZMA_FINISH_ANY) |
24168
|
|
|
|
|
|
|
{ |
24169
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_NOT_FINISHED; |
24170
|
0
|
|
|
|
|
|
return SZ_OK; |
24171
|
|
|
|
|
|
|
} |
24172
|
0
|
0
|
|
|
|
|
if (p->remainLen != 0) |
24173
|
|
|
|
|
|
|
{ |
24174
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_NOT_FINISHED; |
24175
|
0
|
|
|
|
|
|
return SZ_ERROR_DATA; |
24176
|
|
|
|
|
|
|
} |
24177
|
|
|
|
|
|
|
checkEndMarkNow = 1; |
24178
|
|
|
|
|
|
|
} |
24179
|
|
|
|
|
|
|
|
24180
|
504
|
100
|
|
|
|
|
if (p->needInitState) |
24181
|
|
|
|
|
|
|
LzmaDec_InitStateReal(p); |
24182
|
|
|
|
|
|
|
|
24183
|
504
|
50
|
|
|
|
|
if (p->tempBufSize == 0) |
24184
|
|
|
|
|
|
|
{ |
24185
|
|
|
|
|
|
|
size_t processed; |
24186
|
|
|
|
|
|
|
const uint8_t *bufLimit; |
24187
|
504
|
100
|
|
|
|
|
if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
24188
|
|
|
|
|
|
|
{ |
24189
|
470
|
|
|
|
|
|
int dummyRes = LzmaDec_TryDummy(p, src, inSize); |
24190
|
470
|
50
|
|
|
|
|
if (dummyRes == DUMMY_ERROR) |
24191
|
|
|
|
|
|
|
{ |
24192
|
0
|
|
|
|
|
|
memcpy(p->tempBuf, src, inSize); |
24193
|
0
|
|
|
|
|
|
p->tempBufSize = (unsigned)inSize; |
24194
|
0
|
|
|
|
|
|
(*srcLen) += inSize; |
24195
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_NEEDS_MORE_INPUT; |
24196
|
0
|
|
|
|
|
|
return SZ_OK; |
24197
|
|
|
|
|
|
|
} |
24198
|
470
|
50
|
|
|
|
|
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
24199
|
|
|
|
|
|
|
{ |
24200
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_NOT_FINISHED; |
24201
|
0
|
|
|
|
|
|
return SZ_ERROR_DATA; |
24202
|
|
|
|
|
|
|
} |
24203
|
|
|
|
|
|
|
bufLimit = src; |
24204
|
|
|
|
|
|
|
} |
24205
|
|
|
|
|
|
|
else |
24206
|
34
|
|
|
|
|
|
bufLimit = src + inSize - LZMA_REQUIRED_INPUT_MAX; |
24207
|
504
|
|
|
|
|
|
p->buf = src; |
24208
|
504
|
50
|
|
|
|
|
if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0) |
24209
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
24210
|
504
|
|
|
|
|
|
processed = (size_t)(p->buf - src); |
24211
|
504
|
|
|
|
|
|
(*srcLen) += processed; |
24212
|
|
|
|
|
|
|
src += processed; |
24213
|
504
|
|
|
|
|
|
inSize -= processed; |
24214
|
|
|
|
|
|
|
} |
24215
|
|
|
|
|
|
|
else |
24216
|
|
|
|
|
|
|
{ |
24217
|
|
|
|
|
|
|
unsigned rem = p->tempBufSize, lookAhead = 0; |
24218
|
0
|
0
|
|
|
|
|
while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) |
|
|
0
|
|
|
|
|
|
24219
|
0
|
|
|
|
|
|
p->tempBuf[rem++] = src[lookAhead++]; |
24220
|
0
|
|
|
|
|
|
p->tempBufSize = rem; |
24221
|
0
|
0
|
|
|
|
|
if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
24222
|
|
|
|
|
|
|
{ |
24223
|
0
|
|
|
|
|
|
int dummyRes = LzmaDec_TryDummy(p, p->tempBuf, rem); |
24224
|
0
|
0
|
|
|
|
|
if (dummyRes == DUMMY_ERROR) |
24225
|
|
|
|
|
|
|
{ |
24226
|
0
|
|
|
|
|
|
(*srcLen) += lookAhead; |
24227
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_NEEDS_MORE_INPUT; |
24228
|
0
|
|
|
|
|
|
return SZ_OK; |
24229
|
|
|
|
|
|
|
} |
24230
|
0
|
0
|
|
|
|
|
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
24231
|
|
|
|
|
|
|
{ |
24232
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_NOT_FINISHED; |
24233
|
0
|
|
|
|
|
|
return SZ_ERROR_DATA; |
24234
|
|
|
|
|
|
|
} |
24235
|
|
|
|
|
|
|
} |
24236
|
0
|
|
|
|
|
|
p->buf = p->tempBuf; |
24237
|
0
|
0
|
|
|
|
|
if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0) |
24238
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
24239
|
0
|
|
|
|
|
|
lookAhead -= (rem - (unsigned)(p->buf - p->tempBuf)); |
24240
|
0
|
|
|
|
|
|
(*srcLen) += lookAhead; |
24241
|
0
|
|
|
|
|
|
src += lookAhead; |
24242
|
0
|
|
|
|
|
|
inSize -= lookAhead; |
24243
|
0
|
|
|
|
|
|
p->tempBufSize = 0; |
24244
|
|
|
|
|
|
|
} |
24245
|
|
|
|
|
|
|
} |
24246
|
0
|
0
|
|
|
|
|
if (p->code == 0) |
24247
|
0
|
|
|
|
|
|
*status = LZMA_STATUS_FINISHED_WITH_MARK; |
24248
|
0
|
|
|
|
|
|
return (p->code == 0) ? SZ_OK : SZ_ERROR_DATA; |
24249
|
|
|
|
|
|
|
} |
24250
|
|
|
|
|
|
|
|
24251
|
0
|
|
|
|
|
|
SRes LzmaDec_DecodeToBuf(CLzmaDec *p, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status) |
24252
|
|
|
|
|
|
|
{ |
24253
|
0
|
|
|
|
|
|
size_t outSize = *destLen; |
24254
|
0
|
|
|
|
|
|
size_t inSize = *srcLen; |
24255
|
0
|
|
|
|
|
|
*srcLen = *destLen = 0; |
24256
|
0
|
|
|
|
|
|
for (;;) |
24257
|
|
|
|
|
|
|
{ |
24258
|
0
|
|
|
|
|
|
size_t inSizeCur = inSize, outSizeCur, dicPos; |
24259
|
|
|
|
|
|
|
ELzmaFinishMode curFinishMode; |
24260
|
|
|
|
|
|
|
SRes res; |
24261
|
0
|
0
|
|
|
|
|
if (p->dicPos == p->dicBufSize) |
24262
|
0
|
|
|
|
|
|
p->dicPos = 0; |
24263
|
0
|
|
|
|
|
|
dicPos = p->dicPos; |
24264
|
0
|
0
|
|
|
|
|
if (outSize > p->dicBufSize - dicPos) |
24265
|
|
|
|
|
|
|
{ |
24266
|
|
|
|
|
|
|
outSizeCur = p->dicBufSize; |
24267
|
|
|
|
|
|
|
curFinishMode = LZMA_FINISH_ANY; |
24268
|
|
|
|
|
|
|
} |
24269
|
|
|
|
|
|
|
else |
24270
|
|
|
|
|
|
|
{ |
24271
|
0
|
|
|
|
|
|
outSizeCur = dicPos + outSize; |
24272
|
|
|
|
|
|
|
curFinishMode = finishMode; |
24273
|
|
|
|
|
|
|
} |
24274
|
|
|
|
|
|
|
|
24275
|
0
|
|
|
|
|
|
res = LzmaDec_DecodeToDic(p, outSizeCur, src, &inSizeCur, curFinishMode, status); |
24276
|
0
|
|
|
|
|
|
src += inSizeCur; |
24277
|
0
|
|
|
|
|
|
inSize -= inSizeCur; |
24278
|
0
|
|
|
|
|
|
*srcLen += inSizeCur; |
24279
|
0
|
|
|
|
|
|
outSizeCur = p->dicPos - dicPos; |
24280
|
0
|
|
|
|
|
|
memcpy(dest, p->dic + dicPos, outSizeCur); |
24281
|
0
|
|
|
|
|
|
dest += outSizeCur; |
24282
|
0
|
|
|
|
|
|
outSize -= outSizeCur; |
24283
|
0
|
|
|
|
|
|
*destLen += outSizeCur; |
24284
|
0
|
0
|
|
|
|
|
if (res != 0) |
24285
|
0
|
|
|
|
|
|
return res; |
24286
|
0
|
0
|
|
|
|
|
if (outSizeCur == 0 || outSize == 0) |
24287
|
|
|
|
|
|
|
return SZ_OK; |
24288
|
|
|
|
|
|
|
} |
24289
|
|
|
|
|
|
|
} |
24290
|
|
|
|
|
|
|
|
24291
|
0
|
|
|
|
|
|
void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc) |
24292
|
|
|
|
|
|
|
{ |
24293
|
12
|
|
|
|
|
|
alloc->Free(alloc, p->probs); |
24294
|
6
|
|
|
|
|
|
p->probs = 0; |
24295
|
0
|
|
|
|
|
|
} |
24296
|
|
|
|
|
|
|
|
24297
|
|
|
|
|
|
|
static void LzmaDec_FreeDict(CLzmaDec *p, ISzAlloc *alloc) |
24298
|
|
|
|
|
|
|
{ |
24299
|
0
|
|
|
|
|
|
alloc->Free(alloc, p->dic); |
24300
|
0
|
|
|
|
|
|
p->dic = 0; |
24301
|
|
|
|
|
|
|
} |
24302
|
|
|
|
|
|
|
|
24303
|
0
|
|
|
|
|
|
void LzmaDec_Free(CLzmaDec *p, ISzAlloc *alloc) |
24304
|
|
|
|
|
|
|
{ |
24305
|
|
|
|
|
|
|
LzmaDec_FreeProbs(p, alloc); |
24306
|
|
|
|
|
|
|
LzmaDec_FreeDict(p, alloc); |
24307
|
0
|
|
|
|
|
|
} |
24308
|
|
|
|
|
|
|
|
24309
|
6
|
|
|
|
|
|
SRes LzmaProps_Decode(CLzmaProps *p, const uint8_t *data, unsigned size) |
24310
|
|
|
|
|
|
|
{ |
24311
|
|
|
|
|
|
|
uint32_t dicSize; |
24312
|
|
|
|
|
|
|
uint8_t d; |
24313
|
|
|
|
|
|
|
|
24314
|
6
|
50
|
|
|
|
|
if (size < LZMA_PROPS_SIZE) |
24315
|
|
|
|
|
|
|
return SZ_ERROR_UNSUPPORTED; |
24316
|
|
|
|
|
|
|
else |
24317
|
6
|
|
|
|
|
|
dicSize = data[1] | ((uint32_t)data[2] << 8) | ((uint32_t)data[3] << 16) | ((uint32_t)data[4] << 24); |
24318
|
|
|
|
|
|
|
|
24319
|
6
|
50
|
|
|
|
|
if (dicSize < LZMA_DIC_MIN) |
24320
|
|
|
|
|
|
|
dicSize = LZMA_DIC_MIN; |
24321
|
6
|
|
|
|
|
|
p->dicSize = dicSize; |
24322
|
|
|
|
|
|
|
|
24323
|
6
|
|
|
|
|
|
d = data[0]; |
24324
|
6
|
50
|
|
|
|
|
if (d >= (9 * 5 * 5)) |
24325
|
|
|
|
|
|
|
return SZ_ERROR_UNSUPPORTED; |
24326
|
|
|
|
|
|
|
|
24327
|
6
|
|
|
|
|
|
p->lc = d % 9; |
24328
|
6
|
|
|
|
|
|
d /= 9; |
24329
|
6
|
|
|
|
|
|
p->pb = d / 5; |
24330
|
6
|
|
|
|
|
|
p->lp = d % 5; |
24331
|
|
|
|
|
|
|
|
24332
|
6
|
|
|
|
|
|
return SZ_OK; |
24333
|
|
|
|
|
|
|
} |
24334
|
|
|
|
|
|
|
|
24335
|
12
|
|
|
|
|
|
static SRes LzmaDec_AllocateProbs2(CLzmaDec *p, const CLzmaProps *propNew, ISzAlloc *alloc) |
24336
|
|
|
|
|
|
|
{ |
24337
|
6
|
|
|
|
|
|
uint32_t numProbs = LzmaProps_GetNumProbs(propNew); |
24338
|
6
|
50
|
|
|
|
|
if (p->probs == 0 || numProbs != p->numProbs) |
|
|
0
|
|
|
|
|
|
24339
|
|
|
|
|
|
|
{ |
24340
|
|
|
|
|
|
|
LzmaDec_FreeProbs(p, alloc); |
24341
|
6
|
|
|
|
|
|
p->probs = (CLzmaProb *)alloc->Alloc(alloc, numProbs * sizeof(CLzmaProb)); |
24342
|
6
|
|
|
|
|
|
p->numProbs = numProbs; |
24343
|
6
|
50
|
|
|
|
|
if (p->probs == 0) |
24344
|
|
|
|
|
|
|
return SZ_ERROR_MEM; |
24345
|
|
|
|
|
|
|
} |
24346
|
|
|
|
|
|
|
return SZ_OK; |
24347
|
|
|
|
|
|
|
} |
24348
|
|
|
|
|
|
|
|
24349
|
6
|
|
|
|
|
|
SRes LzmaDec_AllocateProbs(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc) |
24350
|
|
|
|
|
|
|
{ |
24351
|
|
|
|
|
|
|
CLzmaProps propNew; |
24352
|
6
|
50
|
|
|
|
|
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
24353
|
6
|
50
|
|
|
|
|
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
24354
|
6
|
|
|
|
|
|
p->prop = propNew; |
24355
|
6
|
|
|
|
|
|
return SZ_OK; |
24356
|
|
|
|
|
|
|
} |
24357
|
|
|
|
|
|
|
|
24358
|
0
|
|
|
|
|
|
SRes LzmaDec_Allocate(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc) |
24359
|
|
|
|
|
|
|
{ |
24360
|
|
|
|
|
|
|
CLzmaProps propNew; |
24361
|
|
|
|
|
|
|
size_t dicBufSize; |
24362
|
0
|
0
|
|
|
|
|
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
24363
|
0
|
0
|
|
|
|
|
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
24364
|
0
|
|
|
|
|
|
dicBufSize = propNew.dicSize; |
24365
|
0
|
0
|
|
|
|
|
if (p->dic == 0 || dicBufSize != p->dicBufSize) |
|
|
0
|
|
|
|
|
|
24366
|
|
|
|
|
|
|
{ |
24367
|
|
|
|
|
|
|
LzmaDec_FreeDict(p, alloc); |
24368
|
0
|
|
|
|
|
|
p->dic = (uint8_t *)alloc->Alloc(alloc, dicBufSize); |
24369
|
0
|
0
|
|
|
|
|
if (p->dic == 0) |
24370
|
|
|
|
|
|
|
{ |
24371
|
|
|
|
|
|
|
LzmaDec_FreeProbs(p, alloc); |
24372
|
0
|
|
|
|
|
|
return SZ_ERROR_MEM; |
24373
|
|
|
|
|
|
|
} |
24374
|
|
|
|
|
|
|
} |
24375
|
0
|
|
|
|
|
|
p->dicBufSize = dicBufSize; |
24376
|
0
|
|
|
|
|
|
p->prop = propNew; |
24377
|
0
|
|
|
|
|
|
return SZ_OK; |
24378
|
|
|
|
|
|
|
} |
24379
|
|
|
|
|
|
|
|
24380
|
6
|
|
|
|
|
|
SRes LzmaDecode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen, |
24381
|
|
|
|
|
|
|
const uint8_t *propData, unsigned propSize, ELzmaFinishMode finishMode, |
24382
|
|
|
|
|
|
|
ELzmaStatus *status, ISzAlloc *alloc) |
24383
|
|
|
|
|
|
|
{ |
24384
|
|
|
|
|
|
|
CLzmaDec p; |
24385
|
|
|
|
|
|
|
SRes res; |
24386
|
6
|
|
|
|
|
|
size_t inSize = *srcLen; |
24387
|
6
|
|
|
|
|
|
size_t outSize = *destLen; |
24388
|
6
|
|
|
|
|
|
*srcLen = *destLen = 0; |
24389
|
6
|
50
|
|
|
|
|
if (inSize < RC_INIT_SIZE) |
24390
|
|
|
|
|
|
|
return SZ_ERROR_INPUT_EOF; |
24391
|
|
|
|
|
|
|
|
24392
|
6
|
|
|
|
|
|
LzmaDec_Construct(&p); |
24393
|
6
|
|
|
|
|
|
res = LzmaDec_AllocateProbs(&p, propData, propSize, alloc); |
24394
|
6
|
50
|
|
|
|
|
if (res != 0) |
24395
|
|
|
|
|
|
|
return res; |
24396
|
6
|
|
|
|
|
|
p.dic = dest; |
24397
|
6
|
|
|
|
|
|
p.dicBufSize = outSize; |
24398
|
|
|
|
|
|
|
|
24399
|
|
|
|
|
|
|
LzmaDec_Init(&p); |
24400
|
|
|
|
|
|
|
|
24401
|
6
|
|
|
|
|
|
*srcLen = inSize; |
24402
|
6
|
|
|
|
|
|
res = LzmaDec_DecodeToDic(&p, outSize, src, srcLen, finishMode, status); |
24403
|
|
|
|
|
|
|
|
24404
|
6
|
50
|
|
|
|
|
if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT) |
|
|
50
|
|
|
|
|
|
24405
|
|
|
|
|
|
|
res = SZ_ERROR_INPUT_EOF; |
24406
|
|
|
|
|
|
|
|
24407
|
6
|
|
|
|
|
|
(*destLen) = p.dicPos; |
24408
|
|
|
|
|
|
|
LzmaDec_FreeProbs(&p, alloc); |
24409
|
|
|
|
|
|
|
return res; |
24410
|
|
|
|
|
|
|
} |
24411
|
|
|
|
|
|
|
|
24412
|
|
|
|
|
|
|
} // namespace lzma |
24413
|
|
|
|
|
|
|
// End of LZMA compression library by Igor Pavlov |
24414
|
|
|
|
|
|
|
|
24415
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
24416
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
24417
|
12
|
|
|
|
|
|
static void *LzmaAlloc(void* /*p*/, size_t size) { return new char[size]; } |
24418
|
24
|
100
|
|
|
|
|
static void LzmaFree(void* /*p*/, void *address) { delete[] (char*) address; } |
24419
|
|
|
|
|
|
|
static lzma::ISzAlloc lzmaAllocator = { LzmaAlloc, LzmaFree }; |
24420
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
24421
|
|
|
|
|
|
|
|
24422
|
6
|
|
|
|
|
|
bool compressor::load(istream& is, binary_decoder& data) { |
24423
|
|
|
|
|
|
|
uint32_t uncompressed_len, compressed_len, poor_crc; |
24424
|
|
|
|
|
|
|
unsigned char props_encoded[LZMA_PROPS_SIZE]; |
24425
|
|
|
|
|
|
|
|
24426
|
6
|
50
|
|
|
|
|
if (!is.read((char *) &uncompressed_len, sizeof(uncompressed_len))) return false; |
24427
|
6
|
50
|
|
|
|
|
if (!is.read((char *) &compressed_len, sizeof(compressed_len))) return false; |
24428
|
6
|
50
|
|
|
|
|
if (!is.read((char *) &poor_crc, sizeof(poor_crc))) return false; |
24429
|
6
|
50
|
|
|
|
|
if (poor_crc != uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890) return false; |
24430
|
6
|
50
|
|
|
|
|
if (!is.read((char *) props_encoded, sizeof(props_encoded))) return false; |
24431
|
|
|
|
|
|
|
|
24432
|
6
|
|
|
|
|
|
vector compressed(compressed_len); |
24433
|
6
|
50
|
|
|
|
|
if (!is.read((char *) compressed.data(), compressed_len)) return false; |
|
|
50
|
|
|
|
|
|
24434
|
|
|
|
|
|
|
|
24435
|
|
|
|
|
|
|
lzma::ELzmaStatus status; |
24436
|
6
|
|
|
|
|
|
size_t uncompressed_size = uncompressed_len, compressed_size = compressed_len; |
24437
|
6
|
50
|
|
|
|
|
auto res = lzma::LzmaDecode(data.fill(uncompressed_len), &uncompressed_size, compressed.data(), &compressed_size, props_encoded, LZMA_PROPS_SIZE, lzma::LZMA_FINISH_ANY, &status, &lzmaAllocator); |
24438
|
6
|
50
|
|
|
|
|
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
24439
|
|
|
|
|
|
|
|
24440
|
6
|
|
|
|
|
|
return true; |
24441
|
|
|
|
|
|
|
} |
24442
|
|
|
|
|
|
|
|
24443
|
|
|
|
|
|
|
} // namespace utils |
24444
|
|
|
|
|
|
|
|
24445
|
|
|
|
|
|
|
///////// |
24446
|
|
|
|
|
|
|
// File: utils/compressor_save.cpp |
24447
|
|
|
|
|
|
|
///////// |
24448
|
|
|
|
|
|
|
|
24449
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
24450
|
|
|
|
|
|
|
// |
24451
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
24452
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
24453
|
|
|
|
|
|
|
// |
24454
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
24455
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
24456
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
24457
|
|
|
|
|
|
|
|
24458
|
|
|
|
|
|
|
namespace utils { |
24459
|
|
|
|
|
|
|
|
24460
|
|
|
|
|
|
|
// Start of LZMA compression library by Igor Pavlov |
24461
|
|
|
|
|
|
|
namespace lzma { |
24462
|
|
|
|
|
|
|
|
24463
|
|
|
|
|
|
|
// Types.h -- Basic types |
24464
|
|
|
|
|
|
|
// 2010-10-09 : Igor Pavlov : Public domain |
24465
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
24466
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
24467
|
|
|
|
|
|
|
|
24468
|
|
|
|
|
|
|
#define SZ_OK 0 |
24469
|
|
|
|
|
|
|
|
24470
|
|
|
|
|
|
|
#define SZ_ERROR_DATA 1 |
24471
|
|
|
|
|
|
|
#define SZ_ERROR_MEM 2 |
24472
|
|
|
|
|
|
|
#define SZ_ERROR_CRC 3 |
24473
|
|
|
|
|
|
|
#define SZ_ERROR_UNSUPPORTED 4 |
24474
|
|
|
|
|
|
|
#define SZ_ERROR_PARAM 5 |
24475
|
|
|
|
|
|
|
#define SZ_ERROR_INPUT_EOF 6 |
24476
|
|
|
|
|
|
|
#define SZ_ERROR_OUTPUT_EOF 7 |
24477
|
|
|
|
|
|
|
#define SZ_ERROR_READ 8 |
24478
|
|
|
|
|
|
|
#define SZ_ERROR_WRITE 9 |
24479
|
|
|
|
|
|
|
#define SZ_ERROR_PROGRESS 10 |
24480
|
|
|
|
|
|
|
#define SZ_ERROR_FAIL 11 |
24481
|
|
|
|
|
|
|
#define SZ_ERROR_THREAD 12 |
24482
|
|
|
|
|
|
|
|
24483
|
|
|
|
|
|
|
#define SZ_ERROR_ARCHIVE 16 |
24484
|
|
|
|
|
|
|
#define SZ_ERROR_NO_ARCHIVE 17 |
24485
|
|
|
|
|
|
|
|
24486
|
|
|
|
|
|
|
typedef int SRes; |
24487
|
|
|
|
|
|
|
|
24488
|
|
|
|
|
|
|
#ifndef RINOK |
24489
|
|
|
|
|
|
|
#define RINOK(x) { int __result__ = (x); if (__result__ != 0) return __result__; } |
24490
|
|
|
|
|
|
|
#endif |
24491
|
|
|
|
|
|
|
|
24492
|
|
|
|
|
|
|
/* The following interfaces use first parameter as pointer to structure */ |
24493
|
|
|
|
|
|
|
|
24494
|
|
|
|
|
|
|
struct IByteIn |
24495
|
|
|
|
|
|
|
{ |
24496
|
|
|
|
|
|
|
uint8_t (*Read)(void *p); /* reads one byte, returns 0 in case of EOF or error */ |
24497
|
|
|
|
|
|
|
}; |
24498
|
|
|
|
|
|
|
|
24499
|
|
|
|
|
|
|
struct IByteOut |
24500
|
|
|
|
|
|
|
{ |
24501
|
|
|
|
|
|
|
void (*Write)(void *p, uint8_t b); |
24502
|
|
|
|
|
|
|
}; |
24503
|
|
|
|
|
|
|
|
24504
|
|
|
|
|
|
|
struct ISeqInStream |
24505
|
|
|
|
|
|
|
{ |
24506
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
24507
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
24508
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
24509
|
|
|
|
|
|
|
}; |
24510
|
|
|
|
|
|
|
|
24511
|
|
|
|
|
|
|
/* it can return SZ_ERROR_INPUT_EOF */ |
24512
|
|
|
|
|
|
|
SRes SeqInStream_Read(ISeqInStream *stream, void *buf, size_t size); |
24513
|
|
|
|
|
|
|
SRes SeqInStream_Read2(ISeqInStream *stream, void *buf, size_t size, SRes errorType); |
24514
|
|
|
|
|
|
|
SRes SeqInStream_ReadByte(ISeqInStream *stream, uint8_t *buf); |
24515
|
|
|
|
|
|
|
|
24516
|
|
|
|
|
|
|
struct ISeqOutStream |
24517
|
|
|
|
|
|
|
{ |
24518
|
|
|
|
|
|
|
size_t (*Write)(void *p, const void *buf, size_t size); |
24519
|
|
|
|
|
|
|
/* Returns: result - the number of actually written bytes. |
24520
|
|
|
|
|
|
|
(result < size) means error */ |
24521
|
|
|
|
|
|
|
}; |
24522
|
|
|
|
|
|
|
|
24523
|
|
|
|
|
|
|
enum ESzSeek |
24524
|
|
|
|
|
|
|
{ |
24525
|
|
|
|
|
|
|
SZ_SEEK_SET = 0, |
24526
|
|
|
|
|
|
|
SZ_SEEK_CUR = 1, |
24527
|
|
|
|
|
|
|
SZ_SEEK_END = 2 |
24528
|
|
|
|
|
|
|
}; |
24529
|
|
|
|
|
|
|
|
24530
|
|
|
|
|
|
|
struct ISeekInStream |
24531
|
|
|
|
|
|
|
{ |
24532
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); /* same as ISeqInStream::Read */ |
24533
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
24534
|
|
|
|
|
|
|
}; |
24535
|
|
|
|
|
|
|
|
24536
|
|
|
|
|
|
|
struct ILookInStream |
24537
|
|
|
|
|
|
|
{ |
24538
|
|
|
|
|
|
|
SRes (*Look)(void *p, const void **buf, size_t *size); |
24539
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
24540
|
|
|
|
|
|
|
(output(*size) > input(*size)) is not allowed |
24541
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
24542
|
|
|
|
|
|
|
SRes (*Skip)(void *p, size_t offset); |
24543
|
|
|
|
|
|
|
/* offset must be <= output(*size) of Look */ |
24544
|
|
|
|
|
|
|
|
24545
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
24546
|
|
|
|
|
|
|
/* reads directly (without buffer). It's same as ISeqInStream::Read */ |
24547
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
24548
|
|
|
|
|
|
|
}; |
24549
|
|
|
|
|
|
|
|
24550
|
|
|
|
|
|
|
SRes LookInStream_LookRead(ILookInStream *stream, void *buf, size_t *size); |
24551
|
|
|
|
|
|
|
SRes LookInStream_SeekTo(ILookInStream *stream, uint64_t offset); |
24552
|
|
|
|
|
|
|
|
24553
|
|
|
|
|
|
|
/* reads via ILookInStream::Read */ |
24554
|
|
|
|
|
|
|
SRes LookInStream_Read2(ILookInStream *stream, void *buf, size_t size, SRes errorType); |
24555
|
|
|
|
|
|
|
SRes LookInStream_Read(ILookInStream *stream, void *buf, size_t size); |
24556
|
|
|
|
|
|
|
|
24557
|
|
|
|
|
|
|
#define LookToRead_BUF_SIZE (1 << 14) |
24558
|
|
|
|
|
|
|
|
24559
|
|
|
|
|
|
|
struct CLookToRead |
24560
|
|
|
|
|
|
|
{ |
24561
|
|
|
|
|
|
|
ILookInStream s; |
24562
|
|
|
|
|
|
|
ISeekInStream *realStream; |
24563
|
|
|
|
|
|
|
size_t pos; |
24564
|
|
|
|
|
|
|
size_t size; |
24565
|
|
|
|
|
|
|
uint8_t buf[LookToRead_BUF_SIZE]; |
24566
|
|
|
|
|
|
|
}; |
24567
|
|
|
|
|
|
|
|
24568
|
|
|
|
|
|
|
void LookToRead_CreateVTable(CLookToRead *p, int lookahead); |
24569
|
|
|
|
|
|
|
void LookToRead_Init(CLookToRead *p); |
24570
|
|
|
|
|
|
|
|
24571
|
|
|
|
|
|
|
struct CSecToLook |
24572
|
|
|
|
|
|
|
{ |
24573
|
|
|
|
|
|
|
ISeqInStream s; |
24574
|
|
|
|
|
|
|
ILookInStream *realStream; |
24575
|
|
|
|
|
|
|
}; |
24576
|
|
|
|
|
|
|
|
24577
|
|
|
|
|
|
|
void SecToLook_CreateVTable(CSecToLook *p); |
24578
|
|
|
|
|
|
|
|
24579
|
|
|
|
|
|
|
struct CSecToRead |
24580
|
|
|
|
|
|
|
{ |
24581
|
|
|
|
|
|
|
ISeqInStream s; |
24582
|
|
|
|
|
|
|
ILookInStream *realStream; |
24583
|
|
|
|
|
|
|
}; |
24584
|
|
|
|
|
|
|
|
24585
|
|
|
|
|
|
|
void SecToRead_CreateVTable(CSecToRead *p); |
24586
|
|
|
|
|
|
|
|
24587
|
|
|
|
|
|
|
struct ICompressProgress |
24588
|
|
|
|
|
|
|
{ |
24589
|
|
|
|
|
|
|
SRes (*Progress)(void *p, uint64_t inSize, uint64_t outSize); |
24590
|
|
|
|
|
|
|
/* Returns: result. (result != SZ_OK) means break. |
24591
|
|
|
|
|
|
|
Value (uint64_t)(int64_t)-1 for size means unknown value. */ |
24592
|
|
|
|
|
|
|
}; |
24593
|
|
|
|
|
|
|
|
24594
|
|
|
|
|
|
|
struct ISzAlloc |
24595
|
|
|
|
|
|
|
{ |
24596
|
|
|
|
|
|
|
void *(*Alloc)(void *p, size_t size); |
24597
|
|
|
|
|
|
|
void (*Free)(void *p, void *address); /* address can be 0 */ |
24598
|
|
|
|
|
|
|
}; |
24599
|
|
|
|
|
|
|
|
24600
|
|
|
|
|
|
|
#define IAlloc_Alloc(p, size) (p)->Alloc((p), size) |
24601
|
|
|
|
|
|
|
#define IAlloc_Free(p, a) (p)->Free((p), a) |
24602
|
|
|
|
|
|
|
|
24603
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
24604
|
|
|
|
|
|
|
|
24605
|
|
|
|
|
|
|
// LzHash.h -- HASH functions for LZ algorithms |
24606
|
|
|
|
|
|
|
// 2009-02-07 : Igor Pavlov : Public domain |
24607
|
|
|
|
|
|
|
|
24608
|
|
|
|
|
|
|
#define kHash2Size (1 << 10) |
24609
|
|
|
|
|
|
|
#define kHash3Size (1 << 16) |
24610
|
|
|
|
|
|
|
#define kHash4Size (1 << 20) |
24611
|
|
|
|
|
|
|
|
24612
|
|
|
|
|
|
|
#define kFix3HashSize (kHash2Size) |
24613
|
|
|
|
|
|
|
#define kFix4HashSize (kHash2Size + kHash3Size) |
24614
|
|
|
|
|
|
|
#define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) |
24615
|
|
|
|
|
|
|
|
24616
|
|
|
|
|
|
|
#define HASH2_CALC hashValue = cur[0] | ((uint32_t)cur[1] << 8); |
24617
|
|
|
|
|
|
|
|
24618
|
|
|
|
|
|
|
#define HASH3_CALC { \ |
24619
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
24620
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
24621
|
|
|
|
|
|
|
hashValue = (temp ^ ((uint32_t)cur[2] << 8)) & p->hashMask; } |
24622
|
|
|
|
|
|
|
|
24623
|
|
|
|
|
|
|
#define HASH4_CALC { \ |
24624
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
24625
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
24626
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \ |
24627
|
|
|
|
|
|
|
hashValue = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)) & p->hashMask; } |
24628
|
|
|
|
|
|
|
|
24629
|
|
|
|
|
|
|
#define HASH5_CALC { \ |
24630
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
24631
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
24632
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \ |
24633
|
|
|
|
|
|
|
hash4Value = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)); \ |
24634
|
|
|
|
|
|
|
hashValue = (hash4Value ^ (p->crc[cur[4]] << 3)) & p->hashMask; \ |
24635
|
|
|
|
|
|
|
hash4Value &= (kHash4Size - 1); } |
24636
|
|
|
|
|
|
|
|
24637
|
|
|
|
|
|
|
/* #define HASH_ZIP_CALC hashValue = ((cur[0] | ((uint32_t)cur[1] << 8)) ^ p->crc[cur[2]]) & 0xFFFF; */ |
24638
|
|
|
|
|
|
|
#define HASH_ZIP_CALC hashValue = ((cur[2] | ((uint32_t)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; |
24639
|
|
|
|
|
|
|
|
24640
|
|
|
|
|
|
|
#define MT_HASH2_CALC \ |
24641
|
|
|
|
|
|
|
hash2Value = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1); |
24642
|
|
|
|
|
|
|
|
24643
|
|
|
|
|
|
|
#define MT_HASH3_CALC { \ |
24644
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
24645
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
24646
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); } |
24647
|
|
|
|
|
|
|
|
24648
|
|
|
|
|
|
|
#define MT_HASH4_CALC { \ |
24649
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
24650
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
24651
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \ |
24652
|
|
|
|
|
|
|
hash4Value = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)) & (kHash4Size - 1); } |
24653
|
|
|
|
|
|
|
|
24654
|
|
|
|
|
|
|
// LzFind.h -- Match finder for LZ algorithms |
24655
|
|
|
|
|
|
|
// 2009-04-22 : Igor Pavlov : Public domain |
24656
|
|
|
|
|
|
|
|
24657
|
|
|
|
|
|
|
typedef uint32_t CLzRef; |
24658
|
|
|
|
|
|
|
|
24659
|
|
|
|
|
|
|
struct CMatchFinder |
24660
|
|
|
|
|
|
|
{ |
24661
|
|
|
|
|
|
|
uint8_t *buffer; |
24662
|
|
|
|
|
|
|
uint32_t pos; |
24663
|
|
|
|
|
|
|
uint32_t posLimit; |
24664
|
|
|
|
|
|
|
uint32_t streamPos; |
24665
|
|
|
|
|
|
|
uint32_t lenLimit; |
24666
|
|
|
|
|
|
|
|
24667
|
|
|
|
|
|
|
uint32_t cyclicBufferPos; |
24668
|
|
|
|
|
|
|
uint32_t cyclicBufferSize; /* it must be = (historySize + 1) */ |
24669
|
|
|
|
|
|
|
|
24670
|
|
|
|
|
|
|
uint32_t matchMaxLen; |
24671
|
|
|
|
|
|
|
CLzRef *hash; |
24672
|
|
|
|
|
|
|
CLzRef *son; |
24673
|
|
|
|
|
|
|
uint32_t hashMask; |
24674
|
|
|
|
|
|
|
uint32_t cutValue; |
24675
|
|
|
|
|
|
|
|
24676
|
|
|
|
|
|
|
uint8_t *bufferBase; |
24677
|
|
|
|
|
|
|
ISeqInStream *stream; |
24678
|
|
|
|
|
|
|
int streamEndWasReached; |
24679
|
|
|
|
|
|
|
|
24680
|
|
|
|
|
|
|
uint32_t blockSize; |
24681
|
|
|
|
|
|
|
uint32_t keepSizeBefore; |
24682
|
|
|
|
|
|
|
uint32_t keepSizeAfter; |
24683
|
|
|
|
|
|
|
|
24684
|
|
|
|
|
|
|
uint32_t numHashBytes; |
24685
|
|
|
|
|
|
|
int directInput; |
24686
|
|
|
|
|
|
|
size_t directInputRem; |
24687
|
|
|
|
|
|
|
int btMode; |
24688
|
|
|
|
|
|
|
int bigHash; |
24689
|
|
|
|
|
|
|
uint32_t historySize; |
24690
|
|
|
|
|
|
|
uint32_t fixedHashSize; |
24691
|
|
|
|
|
|
|
uint32_t hashSizeSum; |
24692
|
|
|
|
|
|
|
uint32_t numSons; |
24693
|
|
|
|
|
|
|
SRes result; |
24694
|
|
|
|
|
|
|
uint32_t crc[256]; |
24695
|
|
|
|
|
|
|
}; |
24696
|
|
|
|
|
|
|
|
24697
|
|
|
|
|
|
|
#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer) |
24698
|
|
|
|
|
|
|
#define Inline_MatchFinder_GetIndexByte(p, index) ((p)->buffer[(int32_t)(index)]) |
24699
|
|
|
|
|
|
|
|
24700
|
|
|
|
|
|
|
#define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) |
24701
|
|
|
|
|
|
|
|
24702
|
|
|
|
|
|
|
int MatchFinder_NeedMove(CMatchFinder *p); |
24703
|
|
|
|
|
|
|
uint8_t *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); |
24704
|
|
|
|
|
|
|
void MatchFinder_MoveBlock(CMatchFinder *p); |
24705
|
|
|
|
|
|
|
void MatchFinder_ReadIfRequired(CMatchFinder *p); |
24706
|
|
|
|
|
|
|
|
24707
|
|
|
|
|
|
|
void MatchFinder_Construct(CMatchFinder *p); |
24708
|
|
|
|
|
|
|
|
24709
|
|
|
|
|
|
|
/* Conditions: |
24710
|
|
|
|
|
|
|
historySize <= 3 GB |
24711
|
|
|
|
|
|
|
keepAddBufferBefore + matchMaxLen + keepAddBufferAfter < 511MB |
24712
|
|
|
|
|
|
|
*/ |
24713
|
|
|
|
|
|
|
int MatchFinder_Create(CMatchFinder *p, uint32_t historySize, |
24714
|
|
|
|
|
|
|
uint32_t keepAddBufferBefore, uint32_t matchMaxLen, uint32_t keepAddBufferAfter, |
24715
|
|
|
|
|
|
|
ISzAlloc *alloc); |
24716
|
|
|
|
|
|
|
void MatchFinder_Free(CMatchFinder *p, ISzAlloc *alloc); |
24717
|
|
|
|
|
|
|
void MatchFinder_Normalize3(uint32_t subValue, CLzRef *items, uint32_t numItems); |
24718
|
|
|
|
|
|
|
void MatchFinder_ReduceOffsets(CMatchFinder *p, uint32_t subValue); |
24719
|
|
|
|
|
|
|
|
24720
|
|
|
|
|
|
|
uint32_t * GetMatchesSpec1(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *buffer, CLzRef *son, |
24721
|
|
|
|
|
|
|
uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t _cutValue, |
24722
|
|
|
|
|
|
|
uint32_t *distances, uint32_t maxLen); |
24723
|
|
|
|
|
|
|
|
24724
|
|
|
|
|
|
|
/* |
24725
|
|
|
|
|
|
|
Conditions: |
24726
|
|
|
|
|
|
|
Mf_GetNumAvailableBytes_Func must be called before each Mf_GetMatchLen_Func. |
24727
|
|
|
|
|
|
|
Mf_GetPointerToCurrentPos_Func's result must be used only before any other function |
24728
|
|
|
|
|
|
|
*/ |
24729
|
|
|
|
|
|
|
|
24730
|
|
|
|
|
|
|
typedef void (*Mf_Init_Func)(CMatchFinder *object); |
24731
|
|
|
|
|
|
|
typedef uint8_t (*Mf_GetIndexByte_Func)(CMatchFinder *object, int32_t index); |
24732
|
|
|
|
|
|
|
typedef uint32_t (*Mf_GetNumAvailableBytes_Func)(CMatchFinder *object); |
24733
|
|
|
|
|
|
|
typedef uint8_t * (*Mf_GetPointerToCurrentPos_Func)(CMatchFinder *object); |
24734
|
|
|
|
|
|
|
typedef uint32_t (*Mf_GetMatches_Func)(CMatchFinder *object, uint32_t *distances); |
24735
|
|
|
|
|
|
|
typedef void (*Mf_Skip_Func)(CMatchFinder *object, uint32_t); |
24736
|
|
|
|
|
|
|
|
24737
|
|
|
|
|
|
|
struct IMatchFinder |
24738
|
|
|
|
|
|
|
{ |
24739
|
|
|
|
|
|
|
Mf_Init_Func Init; |
24740
|
|
|
|
|
|
|
Mf_GetIndexByte_Func GetIndexByte; |
24741
|
|
|
|
|
|
|
Mf_GetNumAvailableBytes_Func GetNumAvailableBytes; |
24742
|
|
|
|
|
|
|
Mf_GetPointerToCurrentPos_Func GetPointerToCurrentPos; |
24743
|
|
|
|
|
|
|
Mf_GetMatches_Func GetMatches; |
24744
|
|
|
|
|
|
|
Mf_Skip_Func Skip; |
24745
|
|
|
|
|
|
|
}; |
24746
|
|
|
|
|
|
|
|
24747
|
|
|
|
|
|
|
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable); |
24748
|
|
|
|
|
|
|
|
24749
|
|
|
|
|
|
|
void MatchFinder_Init(CMatchFinder *p); |
24750
|
|
|
|
|
|
|
uint32_t Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances); |
24751
|
|
|
|
|
|
|
uint32_t Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances); |
24752
|
|
|
|
|
|
|
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num); |
24753
|
|
|
|
|
|
|
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num); |
24754
|
|
|
|
|
|
|
|
24755
|
|
|
|
|
|
|
// LzFind.c -- Match finder for LZ algorithms |
24756
|
|
|
|
|
|
|
// 2009-04-22 : Igor Pavlov : Public domain |
24757
|
|
|
|
|
|
|
|
24758
|
|
|
|
|
|
|
#define kEmptyHashValue 0 |
24759
|
|
|
|
|
|
|
#define kMaxValForNormalize ((uint32_t)0xFFFFFFFF) |
24760
|
|
|
|
|
|
|
#define kNormalizeStepMin (1 << 10) /* it must be power of 2 */ |
24761
|
|
|
|
|
|
|
#define kNormalizeMask (~(kNormalizeStepMin - 1)) |
24762
|
|
|
|
|
|
|
#define kMaxHistorySize ((uint32_t)3 << 30) |
24763
|
|
|
|
|
|
|
|
24764
|
|
|
|
|
|
|
#define kStartMaxLen 3 |
24765
|
|
|
|
|
|
|
|
24766
|
|
|
|
|
|
|
static void LzInWindow_Free(CMatchFinder *p, ISzAlloc *alloc) |
24767
|
|
|
|
|
|
|
{ |
24768
|
0
|
0
|
|
|
|
|
if (!p->directInput) |
24769
|
|
|
|
|
|
|
{ |
24770
|
0
|
|
|
|
|
|
alloc->Free(alloc, p->bufferBase); |
24771
|
0
|
|
|
|
|
|
p->bufferBase = 0; |
24772
|
|
|
|
|
|
|
} |
24773
|
|
|
|
|
|
|
} |
24774
|
|
|
|
|
|
|
|
24775
|
|
|
|
|
|
|
/* keepSizeBefore + keepSizeAfter + keepSizeReserv must be < 4G) */ |
24776
|
|
|
|
|
|
|
|
24777
|
0
|
|
|
|
|
|
static int LzInWindow_Create(CMatchFinder *p, uint32_t keepSizeReserv, ISzAlloc *alloc) |
24778
|
|
|
|
|
|
|
{ |
24779
|
0
|
|
|
|
|
|
uint32_t blockSize = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv; |
24780
|
0
|
0
|
|
|
|
|
if (p->directInput) |
24781
|
|
|
|
|
|
|
{ |
24782
|
0
|
|
|
|
|
|
p->blockSize = blockSize; |
24783
|
0
|
|
|
|
|
|
return 1; |
24784
|
|
|
|
|
|
|
} |
24785
|
0
|
0
|
|
|
|
|
if (p->bufferBase == 0 || p->blockSize != blockSize) |
|
|
0
|
|
|
|
|
|
24786
|
|
|
|
|
|
|
{ |
24787
|
|
|
|
|
|
|
LzInWindow_Free(p, alloc); |
24788
|
0
|
|
|
|
|
|
p->blockSize = blockSize; |
24789
|
0
|
|
|
|
|
|
p->bufferBase = (uint8_t *)alloc->Alloc(alloc, (size_t)blockSize); |
24790
|
|
|
|
|
|
|
} |
24791
|
0
|
|
|
|
|
|
return (p->bufferBase != 0); |
24792
|
|
|
|
|
|
|
} |
24793
|
|
|
|
|
|
|
|
24794
|
0
|
|
|
|
|
|
uint8_t *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; } |
24795
|
0
|
|
|
|
|
|
uint8_t MatchFinder_GetIndexByte(CMatchFinder *p, int32_t index) { return p->buffer[index]; } |
24796
|
|
|
|
|
|
|
|
24797
|
0
|
|
|
|
|
|
uint32_t MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return p->streamPos - p->pos; } |
24798
|
|
|
|
|
|
|
|
24799
|
0
|
|
|
|
|
|
void MatchFinder_ReduceOffsets(CMatchFinder *p, uint32_t subValue) |
24800
|
|
|
|
|
|
|
{ |
24801
|
0
|
|
|
|
|
|
p->posLimit -= subValue; |
24802
|
0
|
|
|
|
|
|
p->pos -= subValue; |
24803
|
0
|
|
|
|
|
|
p->streamPos -= subValue; |
24804
|
0
|
|
|
|
|
|
} |
24805
|
|
|
|
|
|
|
|
24806
|
0
|
|
|
|
|
|
static void MatchFinder_ReadBlock(CMatchFinder *p) |
24807
|
|
|
|
|
|
|
{ |
24808
|
0
|
0
|
|
|
|
|
if (p->streamEndWasReached || p->result != SZ_OK) |
|
|
0
|
|
|
|
|
|
24809
|
|
|
|
|
|
|
return; |
24810
|
0
|
0
|
|
|
|
|
if (p->directInput) |
24811
|
|
|
|
|
|
|
{ |
24812
|
0
|
|
|
|
|
|
uint32_t curSize = 0xFFFFFFFF - p->streamPos; |
24813
|
0
|
0
|
|
|
|
|
if (curSize > p->directInputRem) |
24814
|
0
|
|
|
|
|
|
curSize = (uint32_t)p->directInputRem; |
24815
|
0
|
|
|
|
|
|
p->directInputRem -= curSize; |
24816
|
0
|
|
|
|
|
|
p->streamPos += curSize; |
24817
|
0
|
0
|
|
|
|
|
if (p->directInputRem == 0) |
24818
|
0
|
|
|
|
|
|
p->streamEndWasReached = 1; |
24819
|
|
|
|
|
|
|
return; |
24820
|
|
|
|
|
|
|
} |
24821
|
0
|
|
|
|
|
|
for (;;) |
24822
|
|
|
|
|
|
|
{ |
24823
|
0
|
|
|
|
|
|
uint8_t *dest = p->buffer + (p->streamPos - p->pos); |
24824
|
0
|
|
|
|
|
|
size_t size = (p->bufferBase + p->blockSize - dest); |
24825
|
0
|
0
|
|
|
|
|
if (size == 0) |
24826
|
0
|
|
|
|
|
|
return; |
24827
|
0
|
|
|
|
|
|
p->result = p->stream->Read(p->stream, dest, &size); |
24828
|
0
|
0
|
|
|
|
|
if (p->result != SZ_OK) |
24829
|
|
|
|
|
|
|
return; |
24830
|
0
|
0
|
|
|
|
|
if (size == 0) |
24831
|
|
|
|
|
|
|
{ |
24832
|
0
|
|
|
|
|
|
p->streamEndWasReached = 1; |
24833
|
0
|
|
|
|
|
|
return; |
24834
|
|
|
|
|
|
|
} |
24835
|
0
|
|
|
|
|
|
p->streamPos += (uint32_t)size; |
24836
|
0
|
0
|
|
|
|
|
if (p->streamPos - p->pos > p->keepSizeAfter) |
24837
|
|
|
|
|
|
|
return; |
24838
|
|
|
|
|
|
|
} |
24839
|
|
|
|
|
|
|
} |
24840
|
|
|
|
|
|
|
|
24841
|
0
|
|
|
|
|
|
void MatchFinder_MoveBlock(CMatchFinder *p) |
24842
|
|
|
|
|
|
|
{ |
24843
|
0
|
|
|
|
|
|
memmove(p->bufferBase, |
24844
|
0
|
|
|
|
|
|
p->buffer - p->keepSizeBefore, |
24845
|
0
|
|
|
|
|
|
(size_t)(p->streamPos - p->pos + p->keepSizeBefore)); |
24846
|
0
|
|
|
|
|
|
p->buffer = p->bufferBase + p->keepSizeBefore; |
24847
|
0
|
|
|
|
|
|
} |
24848
|
|
|
|
|
|
|
|
24849
|
0
|
|
|
|
|
|
int MatchFinder_NeedMove(CMatchFinder *p) |
24850
|
|
|
|
|
|
|
{ |
24851
|
0
|
0
|
|
|
|
|
if (p->directInput) |
|
|
0
|
|
|
|
|
|
24852
|
|
|
|
|
|
|
return 0; |
24853
|
|
|
|
|
|
|
/* if (p->streamEndWasReached) return 0; */ |
24854
|
0
|
|
|
|
|
|
return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter); |
24855
|
|
|
|
|
|
|
} |
24856
|
|
|
|
|
|
|
|
24857
|
0
|
|
|
|
|
|
void MatchFinder_ReadIfRequired(CMatchFinder *p) |
24858
|
|
|
|
|
|
|
{ |
24859
|
0
|
0
|
|
|
|
|
if (p->streamEndWasReached) |
24860
|
|
|
|
|
|
|
return; |
24861
|
0
|
0
|
|
|
|
|
if (p->keepSizeAfter >= p->streamPos - p->pos) |
24862
|
0
|
|
|
|
|
|
MatchFinder_ReadBlock(p); |
24863
|
|
|
|
|
|
|
} |
24864
|
|
|
|
|
|
|
|
24865
|
0
|
|
|
|
|
|
static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p) |
24866
|
|
|
|
|
|
|
{ |
24867
|
0
|
0
|
|
|
|
|
if (MatchFinder_NeedMove(p)) |
24868
|
0
|
|
|
|
|
|
MatchFinder_MoveBlock(p); |
24869
|
0
|
|
|
|
|
|
MatchFinder_ReadBlock(p); |
24870
|
0
|
|
|
|
|
|
} |
24871
|
|
|
|
|
|
|
|
24872
|
|
|
|
|
|
|
static void MatchFinder_SetDefaultSettings(CMatchFinder *p) |
24873
|
|
|
|
|
|
|
{ |
24874
|
0
|
|
|
|
|
|
p->cutValue = 32; |
24875
|
0
|
|
|
|
|
|
p->btMode = 1; |
24876
|
0
|
|
|
|
|
|
p->numHashBytes = 4; |
24877
|
0
|
|
|
|
|
|
p->bigHash = 0; |
24878
|
|
|
|
|
|
|
} |
24879
|
|
|
|
|
|
|
|
24880
|
|
|
|
|
|
|
#define kCrcPoly 0xEDB88320 |
24881
|
|
|
|
|
|
|
|
24882
|
0
|
|
|
|
|
|
void MatchFinder_Construct(CMatchFinder *p) |
24883
|
|
|
|
|
|
|
{ |
24884
|
|
|
|
|
|
|
uint32_t i; |
24885
|
0
|
|
|
|
|
|
p->bufferBase = 0; |
24886
|
0
|
|
|
|
|
|
p->directInput = 0; |
24887
|
0
|
|
|
|
|
|
p->hash = 0; |
24888
|
|
|
|
|
|
|
MatchFinder_SetDefaultSettings(p); |
24889
|
|
|
|
|
|
|
|
24890
|
0
|
0
|
|
|
|
|
for (i = 0; i < 256; i++) |
|
|
0
|
|
|
|
|
|
24891
|
|
|
|
|
|
|
{ |
24892
|
|
|
|
|
|
|
uint32_t r = i; |
24893
|
|
|
|
|
|
|
int j; |
24894
|
0
|
0
|
|
|
|
|
for (j = 0; j < 8; j++) |
|
|
0
|
|
|
|
|
|
24895
|
0
|
|
|
|
|
|
r = (r >> 1) ^ (kCrcPoly & ~((r & 1) - 1)); |
24896
|
0
|
|
|
|
|
|
p->crc[i] = r; |
24897
|
|
|
|
|
|
|
} |
24898
|
0
|
|
|
|
|
|
} |
24899
|
|
|
|
|
|
|
|
24900
|
|
|
|
|
|
|
static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAlloc *alloc) |
24901
|
|
|
|
|
|
|
{ |
24902
|
0
|
|
|
|
|
|
alloc->Free(alloc, p->hash); |
24903
|
0
|
|
|
|
|
|
p->hash = 0; |
24904
|
|
|
|
|
|
|
} |
24905
|
|
|
|
|
|
|
|
24906
|
0
|
|
|
|
|
|
void MatchFinder_Free(CMatchFinder *p, ISzAlloc *alloc) |
24907
|
|
|
|
|
|
|
{ |
24908
|
|
|
|
|
|
|
MatchFinder_FreeThisClassMemory(p, alloc); |
24909
|
|
|
|
|
|
|
LzInWindow_Free(p, alloc); |
24910
|
0
|
|
|
|
|
|
} |
24911
|
|
|
|
|
|
|
|
24912
|
|
|
|
|
|
|
static CLzRef* AllocRefs(uint32_t num, ISzAlloc *alloc) |
24913
|
|
|
|
|
|
|
{ |
24914
|
0
|
|
|
|
|
|
size_t sizeInBytes = (size_t)num * sizeof(CLzRef); |
24915
|
0
|
0
|
|
|
|
|
if (sizeInBytes / sizeof(CLzRef) != num) |
24916
|
|
|
|
|
|
|
return 0; |
24917
|
0
|
|
|
|
|
|
return (CLzRef *)alloc->Alloc(alloc, sizeInBytes); |
24918
|
|
|
|
|
|
|
} |
24919
|
|
|
|
|
|
|
|
24920
|
0
|
|
|
|
|
|
int MatchFinder_Create(CMatchFinder *p, uint32_t historySize, |
24921
|
|
|
|
|
|
|
uint32_t keepAddBufferBefore, uint32_t matchMaxLen, uint32_t keepAddBufferAfter, |
24922
|
|
|
|
|
|
|
ISzAlloc *alloc) |
24923
|
|
|
|
|
|
|
{ |
24924
|
|
|
|
|
|
|
uint32_t sizeReserv; |
24925
|
0
|
0
|
|
|
|
|
if (historySize > kMaxHistorySize) |
24926
|
|
|
|
|
|
|
{ |
24927
|
|
|
|
|
|
|
MatchFinder_Free(p, alloc); |
24928
|
|
|
|
|
|
|
return 0; |
24929
|
|
|
|
|
|
|
} |
24930
|
0
|
|
|
|
|
|
sizeReserv = historySize >> 1; |
24931
|
0
|
0
|
|
|
|
|
if (historySize > ((uint32_t)2 << 30)) |
24932
|
0
|
|
|
|
|
|
sizeReserv = historySize >> 2; |
24933
|
0
|
|
|
|
|
|
sizeReserv += (keepAddBufferBefore + matchMaxLen + keepAddBufferAfter) / 2 + (1 << 19); |
24934
|
|
|
|
|
|
|
|
24935
|
0
|
|
|
|
|
|
p->keepSizeBefore = historySize + keepAddBufferBefore + 1; |
24936
|
0
|
|
|
|
|
|
p->keepSizeAfter = matchMaxLen + keepAddBufferAfter; |
24937
|
|
|
|
|
|
|
/* we need one additional byte, since we use MoveBlock after pos++ and before dictionary using */ |
24938
|
0
|
0
|
|
|
|
|
if (LzInWindow_Create(p, sizeReserv, alloc)) |
24939
|
|
|
|
|
|
|
{ |
24940
|
0
|
|
|
|
|
|
uint32_t newCyclicBufferSize = historySize + 1; |
24941
|
|
|
|
|
|
|
uint32_t hs; |
24942
|
0
|
|
|
|
|
|
p->matchMaxLen = matchMaxLen; |
24943
|
|
|
|
|
|
|
{ |
24944
|
0
|
|
|
|
|
|
p->fixedHashSize = 0; |
24945
|
0
|
0
|
|
|
|
|
if (p->numHashBytes == 2) |
24946
|
|
|
|
|
|
|
hs = (1 << 16) - 1; |
24947
|
|
|
|
|
|
|
else |
24948
|
|
|
|
|
|
|
{ |
24949
|
0
|
|
|
|
|
|
hs = historySize - 1; |
24950
|
0
|
|
|
|
|
|
hs |= (hs >> 1); |
24951
|
0
|
|
|
|
|
|
hs |= (hs >> 2); |
24952
|
0
|
|
|
|
|
|
hs |= (hs >> 4); |
24953
|
0
|
|
|
|
|
|
hs |= (hs >> 8); |
24954
|
0
|
|
|
|
|
|
hs >>= 1; |
24955
|
0
|
|
|
|
|
|
hs |= 0xFFFF; /* don't change it! It's required for Deflate */ |
24956
|
0
|
0
|
|
|
|
|
if (hs > (1 << 24)) |
24957
|
|
|
|
|
|
|
{ |
24958
|
0
|
0
|
|
|
|
|
if (p->numHashBytes == 3) |
24959
|
|
|
|
|
|
|
hs = (1 << 24) - 1; |
24960
|
|
|
|
|
|
|
else |
24961
|
0
|
|
|
|
|
|
hs >>= 1; |
24962
|
|
|
|
|
|
|
} |
24963
|
|
|
|
|
|
|
} |
24964
|
0
|
|
|
|
|
|
p->hashMask = hs; |
24965
|
0
|
|
|
|
|
|
hs++; |
24966
|
0
|
0
|
|
|
|
|
if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; |
24967
|
0
|
0
|
|
|
|
|
if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; |
24968
|
0
|
0
|
|
|
|
|
if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size; |
24969
|
0
|
|
|
|
|
|
hs += p->fixedHashSize; |
24970
|
|
|
|
|
|
|
} |
24971
|
|
|
|
|
|
|
|
24972
|
|
|
|
|
|
|
{ |
24973
|
0
|
|
|
|
|
|
uint32_t prevSize = p->hashSizeSum + p->numSons; |
24974
|
|
|
|
|
|
|
uint32_t newSize; |
24975
|
0
|
|
|
|
|
|
p->historySize = historySize; |
24976
|
0
|
|
|
|
|
|
p->hashSizeSum = hs; |
24977
|
0
|
|
|
|
|
|
p->cyclicBufferSize = newCyclicBufferSize; |
24978
|
0
|
0
|
|
|
|
|
p->numSons = (p->btMode ? newCyclicBufferSize * 2 : newCyclicBufferSize); |
24979
|
0
|
|
|
|
|
|
newSize = p->hashSizeSum + p->numSons; |
24980
|
0
|
0
|
|
|
|
|
if (p->hash != 0 && prevSize == newSize) |
|
|
0
|
|
|
|
|
|
24981
|
|
|
|
|
|
|
return 1; |
24982
|
|
|
|
|
|
|
MatchFinder_FreeThisClassMemory(p, alloc); |
24983
|
0
|
|
|
|
|
|
p->hash = AllocRefs(newSize, alloc); |
24984
|
0
|
0
|
|
|
|
|
if (p->hash != 0) |
24985
|
|
|
|
|
|
|
{ |
24986
|
0
|
|
|
|
|
|
p->son = p->hash + p->hashSizeSum; |
24987
|
0
|
|
|
|
|
|
return 1; |
24988
|
|
|
|
|
|
|
} |
24989
|
|
|
|
|
|
|
} |
24990
|
|
|
|
|
|
|
} |
24991
|
|
|
|
|
|
|
MatchFinder_Free(p, alloc); |
24992
|
|
|
|
|
|
|
return 0; |
24993
|
|
|
|
|
|
|
} |
24994
|
|
|
|
|
|
|
|
24995
|
0
|
|
|
|
|
|
static void MatchFinder_SetLimits(CMatchFinder *p) |
24996
|
|
|
|
|
|
|
{ |
24997
|
0
|
|
|
|
|
|
uint32_t limit = kMaxValForNormalize - p->pos; |
24998
|
0
|
|
|
|
|
|
uint32_t limit2 = p->cyclicBufferSize - p->cyclicBufferPos; |
24999
|
0
|
0
|
|
|
|
|
if (limit2 < limit) |
25000
|
|
|
|
|
|
|
limit = limit2; |
25001
|
0
|
|
|
|
|
|
limit2 = p->streamPos - p->pos; |
25002
|
0
|
0
|
|
|
|
|
if (limit2 <= p->keepSizeAfter) |
25003
|
|
|
|
|
|
|
{ |
25004
|
0
|
0
|
|
|
|
|
if (limit2 > 0) |
25005
|
|
|
|
|
|
|
limit2 = 1; |
25006
|
|
|
|
|
|
|
} |
25007
|
|
|
|
|
|
|
else |
25008
|
0
|
|
|
|
|
|
limit2 -= p->keepSizeAfter; |
25009
|
0
|
0
|
|
|
|
|
if (limit2 < limit) |
25010
|
|
|
|
|
|
|
limit = limit2; |
25011
|
|
|
|
|
|
|
{ |
25012
|
|
|
|
|
|
|
uint32_t lenLimit = p->streamPos - p->pos; |
25013
|
0
|
0
|
|
|
|
|
if (lenLimit > p->matchMaxLen) |
25014
|
|
|
|
|
|
|
lenLimit = p->matchMaxLen; |
25015
|
0
|
|
|
|
|
|
p->lenLimit = lenLimit; |
25016
|
|
|
|
|
|
|
} |
25017
|
0
|
|
|
|
|
|
p->posLimit = p->pos + limit; |
25018
|
0
|
|
|
|
|
|
} |
25019
|
|
|
|
|
|
|
|
25020
|
0
|
|
|
|
|
|
void MatchFinder_Init(CMatchFinder *p) |
25021
|
|
|
|
|
|
|
{ |
25022
|
|
|
|
|
|
|
uint32_t i; |
25023
|
0
|
0
|
|
|
|
|
for (i = 0; i < p->hashSizeSum; i++) |
25024
|
0
|
|
|
|
|
|
p->hash[i] = kEmptyHashValue; |
25025
|
0
|
|
|
|
|
|
p->cyclicBufferPos = 0; |
25026
|
0
|
|
|
|
|
|
p->buffer = p->bufferBase; |
25027
|
0
|
|
|
|
|
|
p->pos = p->streamPos = p->cyclicBufferSize; |
25028
|
0
|
|
|
|
|
|
p->result = SZ_OK; |
25029
|
0
|
|
|
|
|
|
p->streamEndWasReached = 0; |
25030
|
0
|
|
|
|
|
|
MatchFinder_ReadBlock(p); |
25031
|
0
|
|
|
|
|
|
MatchFinder_SetLimits(p); |
25032
|
0
|
|
|
|
|
|
} |
25033
|
|
|
|
|
|
|
|
25034
|
|
|
|
|
|
|
static uint32_t MatchFinder_GetSubValue(CMatchFinder *p) |
25035
|
|
|
|
|
|
|
{ |
25036
|
0
|
|
|
|
|
|
return (p->pos - p->historySize - 1) & kNormalizeMask; |
25037
|
|
|
|
|
|
|
} |
25038
|
|
|
|
|
|
|
|
25039
|
0
|
|
|
|
|
|
void MatchFinder_Normalize3(uint32_t subValue, CLzRef *items, uint32_t numItems) |
25040
|
|
|
|
|
|
|
{ |
25041
|
|
|
|
|
|
|
uint32_t i; |
25042
|
0
|
0
|
|
|
|
|
for (i = 0; i < numItems; i++) |
|
|
0
|
|
|
|
|
|
25043
|
|
|
|
|
|
|
{ |
25044
|
0
|
|
|
|
|
|
uint32_t value = items[i]; |
25045
|
0
|
0
|
|
|
|
|
if (value <= subValue) |
|
|
0
|
|
|
|
|
|
25046
|
|
|
|
|
|
|
value = kEmptyHashValue; |
25047
|
|
|
|
|
|
|
else |
25048
|
0
|
|
|
|
|
|
value -= subValue; |
25049
|
0
|
|
|
|
|
|
items[i] = value; |
25050
|
|
|
|
|
|
|
} |
25051
|
0
|
|
|
|
|
|
} |
25052
|
|
|
|
|
|
|
|
25053
|
0
|
|
|
|
|
|
static void MatchFinder_Normalize(CMatchFinder *p) |
25054
|
|
|
|
|
|
|
{ |
25055
|
|
|
|
|
|
|
uint32_t subValue = MatchFinder_GetSubValue(p); |
25056
|
0
|
|
|
|
|
|
MatchFinder_Normalize3(subValue, p->hash, p->hashSizeSum + p->numSons); |
25057
|
|
|
|
|
|
|
MatchFinder_ReduceOffsets(p, subValue); |
25058
|
0
|
|
|
|
|
|
} |
25059
|
|
|
|
|
|
|
|
25060
|
0
|
|
|
|
|
|
static void MatchFinder_CheckLimits(CMatchFinder *p) |
25061
|
|
|
|
|
|
|
{ |
25062
|
0
|
0
|
|
|
|
|
if (p->pos == kMaxValForNormalize) |
25063
|
0
|
|
|
|
|
|
MatchFinder_Normalize(p); |
25064
|
0
|
0
|
|
|
|
|
if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) |
|
|
0
|
|
|
|
|
|
25065
|
0
|
|
|
|
|
|
MatchFinder_CheckAndMoveAndRead(p); |
25066
|
0
|
0
|
|
|
|
|
if (p->cyclicBufferPos == p->cyclicBufferSize) |
25067
|
0
|
|
|
|
|
|
p->cyclicBufferPos = 0; |
25068
|
0
|
|
|
|
|
|
MatchFinder_SetLimits(p); |
25069
|
0
|
|
|
|
|
|
} |
25070
|
|
|
|
|
|
|
|
25071
|
0
|
|
|
|
|
|
static uint32_t * Hc_GetMatchesSpec(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son, |
25072
|
|
|
|
|
|
|
uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue, |
25073
|
|
|
|
|
|
|
uint32_t *distances, uint32_t maxLen) |
25074
|
|
|
|
|
|
|
{ |
25075
|
0
|
|
|
|
|
|
son[_cyclicBufferPos] = curMatch; |
25076
|
|
|
|
|
|
|
for (;;) |
25077
|
|
|
|
|
|
|
{ |
25078
|
0
|
|
|
|
|
|
uint32_t delta = pos - curMatch; |
25079
|
0
|
0
|
|
|
|
|
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25080
|
|
|
|
|
|
|
return distances; |
25081
|
|
|
|
|
|
|
{ |
25082
|
0
|
|
|
|
|
|
const uint8_t *pb = cur - delta; |
25083
|
0
|
0
|
|
|
|
|
curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; |
25084
|
0
|
0
|
|
|
|
|
if (pb[maxLen] == cur[maxLen] && *pb == *cur) |
|
|
0
|
|
|
|
|
|
25085
|
|
|
|
|
|
|
{ |
25086
|
|
|
|
|
|
|
uint32_t len = 0; |
25087
|
0
|
0
|
|
|
|
|
while (++len != lenLimit) |
25088
|
0
|
0
|
|
|
|
|
if (pb[len] != cur[len]) |
25089
|
|
|
|
|
|
|
break; |
25090
|
0
|
0
|
|
|
|
|
if (maxLen < len) |
25091
|
|
|
|
|
|
|
{ |
25092
|
0
|
|
|
|
|
|
*distances++ = maxLen = len; |
25093
|
0
|
|
|
|
|
|
*distances++ = delta - 1; |
25094
|
0
|
0
|
|
|
|
|
if (len == lenLimit) |
25095
|
|
|
|
|
|
|
return distances; |
25096
|
|
|
|
|
|
|
} |
25097
|
|
|
|
|
|
|
} |
25098
|
|
|
|
|
|
|
} |
25099
|
|
|
|
|
|
|
} |
25100
|
|
|
|
|
|
|
} |
25101
|
|
|
|
|
|
|
|
25102
|
0
|
|
|
|
|
|
uint32_t * GetMatchesSpec1(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son, |
25103
|
|
|
|
|
|
|
uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue, |
25104
|
|
|
|
|
|
|
uint32_t *distances, uint32_t maxLen) |
25105
|
|
|
|
|
|
|
{ |
25106
|
0
|
|
|
|
|
|
CLzRef *ptr0 = son + (_cyclicBufferPos << 1) + 1; |
25107
|
0
|
|
|
|
|
|
CLzRef *ptr1 = son + (_cyclicBufferPos << 1); |
25108
|
|
|
|
|
|
|
uint32_t len0 = 0, len1 = 0; |
25109
|
|
|
|
|
|
|
for (;;) |
25110
|
|
|
|
|
|
|
{ |
25111
|
0
|
|
|
|
|
|
uint32_t delta = pos - curMatch; |
25112
|
0
|
0
|
|
|
|
|
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25113
|
|
|
|
|
|
|
{ |
25114
|
0
|
|
|
|
|
|
*ptr0 = *ptr1 = kEmptyHashValue; |
25115
|
0
|
|
|
|
|
|
return distances; |
25116
|
|
|
|
|
|
|
} |
25117
|
|
|
|
|
|
|
{ |
25118
|
0
|
0
|
|
|
|
|
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
25119
|
0
|
|
|
|
|
|
const uint8_t *pb = cur - delta; |
25120
|
0
|
0
|
|
|
|
|
uint32_t len = (len0 < len1 ? len0 : len1); |
25121
|
0
|
0
|
|
|
|
|
if (pb[len] == cur[len]) |
25122
|
|
|
|
|
|
|
{ |
25123
|
0
|
0
|
|
|
|
|
if (++len != lenLimit && pb[len] == cur[len]) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25124
|
0
|
0
|
|
|
|
|
while (++len != lenLimit) |
25125
|
0
|
0
|
|
|
|
|
if (pb[len] != cur[len]) |
25126
|
|
|
|
|
|
|
break; |
25127
|
0
|
0
|
|
|
|
|
if (maxLen < len) |
25128
|
|
|
|
|
|
|
{ |
25129
|
0
|
|
|
|
|
|
*distances++ = maxLen = len; |
25130
|
0
|
|
|
|
|
|
*distances++ = delta - 1; |
25131
|
0
|
0
|
|
|
|
|
if (len == lenLimit) |
25132
|
|
|
|
|
|
|
{ |
25133
|
0
|
|
|
|
|
|
*ptr1 = pair[0]; |
25134
|
0
|
|
|
|
|
|
*ptr0 = pair[1]; |
25135
|
0
|
|
|
|
|
|
return distances; |
25136
|
|
|
|
|
|
|
} |
25137
|
|
|
|
|
|
|
} |
25138
|
|
|
|
|
|
|
} |
25139
|
0
|
0
|
|
|
|
|
if (pb[len] < cur[len]) |
25140
|
|
|
|
|
|
|
{ |
25141
|
0
|
|
|
|
|
|
*ptr1 = curMatch; |
25142
|
0
|
|
|
|
|
|
ptr1 = pair + 1; |
25143
|
0
|
|
|
|
|
|
curMatch = *ptr1; |
25144
|
|
|
|
|
|
|
len1 = len; |
25145
|
|
|
|
|
|
|
} |
25146
|
|
|
|
|
|
|
else |
25147
|
|
|
|
|
|
|
{ |
25148
|
0
|
|
|
|
|
|
*ptr0 = curMatch; |
25149
|
|
|
|
|
|
|
ptr0 = pair; |
25150
|
0
|
|
|
|
|
|
curMatch = *ptr0; |
25151
|
|
|
|
|
|
|
len0 = len; |
25152
|
|
|
|
|
|
|
} |
25153
|
|
|
|
|
|
|
} |
25154
|
|
|
|
|
|
|
} |
25155
|
|
|
|
|
|
|
} |
25156
|
|
|
|
|
|
|
|
25157
|
0
|
|
|
|
|
|
static void SkipMatchesSpec(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son, |
25158
|
|
|
|
|
|
|
uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue) |
25159
|
|
|
|
|
|
|
{ |
25160
|
0
|
|
|
|
|
|
CLzRef *ptr0 = son + (_cyclicBufferPos << 1) + 1; |
25161
|
0
|
|
|
|
|
|
CLzRef *ptr1 = son + (_cyclicBufferPos << 1); |
25162
|
|
|
|
|
|
|
uint32_t len0 = 0, len1 = 0; |
25163
|
|
|
|
|
|
|
for (;;) |
25164
|
|
|
|
|
|
|
{ |
25165
|
0
|
|
|
|
|
|
uint32_t delta = pos - curMatch; |
25166
|
0
|
0
|
|
|
|
|
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25167
|
|
|
|
|
|
|
{ |
25168
|
0
|
|
|
|
|
|
*ptr0 = *ptr1 = kEmptyHashValue; |
25169
|
0
|
|
|
|
|
|
return; |
25170
|
|
|
|
|
|
|
} |
25171
|
|
|
|
|
|
|
{ |
25172
|
0
|
0
|
|
|
|
|
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
25173
|
0
|
|
|
|
|
|
const uint8_t *pb = cur - delta; |
25174
|
0
|
0
|
|
|
|
|
uint32_t len = (len0 < len1 ? len0 : len1); |
25175
|
0
|
0
|
|
|
|
|
if (pb[len] == cur[len]) |
25176
|
|
|
|
|
|
|
{ |
25177
|
0
|
0
|
|
|
|
|
while (++len != lenLimit) |
25178
|
0
|
0
|
|
|
|
|
if (pb[len] != cur[len]) |
25179
|
|
|
|
|
|
|
break; |
25180
|
|
|
|
|
|
|
{ |
25181
|
0
|
0
|
|
|
|
|
if (len == lenLimit) |
25182
|
|
|
|
|
|
|
{ |
25183
|
0
|
|
|
|
|
|
*ptr1 = pair[0]; |
25184
|
0
|
|
|
|
|
|
*ptr0 = pair[1]; |
25185
|
0
|
|
|
|
|
|
return; |
25186
|
|
|
|
|
|
|
} |
25187
|
|
|
|
|
|
|
} |
25188
|
|
|
|
|
|
|
} |
25189
|
0
|
0
|
|
|
|
|
if (pb[len] < cur[len]) |
25190
|
|
|
|
|
|
|
{ |
25191
|
0
|
|
|
|
|
|
*ptr1 = curMatch; |
25192
|
0
|
|
|
|
|
|
ptr1 = pair + 1; |
25193
|
0
|
|
|
|
|
|
curMatch = *ptr1; |
25194
|
|
|
|
|
|
|
len1 = len; |
25195
|
|
|
|
|
|
|
} |
25196
|
|
|
|
|
|
|
else |
25197
|
|
|
|
|
|
|
{ |
25198
|
0
|
|
|
|
|
|
*ptr0 = curMatch; |
25199
|
|
|
|
|
|
|
ptr0 = pair; |
25200
|
0
|
|
|
|
|
|
curMatch = *ptr0; |
25201
|
|
|
|
|
|
|
len0 = len; |
25202
|
|
|
|
|
|
|
} |
25203
|
|
|
|
|
|
|
} |
25204
|
|
|
|
|
|
|
} |
25205
|
|
|
|
|
|
|
} |
25206
|
|
|
|
|
|
|
|
25207
|
|
|
|
|
|
|
#define MOVE_POS \ |
25208
|
|
|
|
|
|
|
++p->cyclicBufferPos; \ |
25209
|
|
|
|
|
|
|
p->buffer++; \ |
25210
|
|
|
|
|
|
|
if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p); |
25211
|
|
|
|
|
|
|
|
25212
|
|
|
|
|
|
|
#define MOVE_POS_RET MOVE_POS return offset; |
25213
|
|
|
|
|
|
|
|
25214
|
0
|
0
|
|
|
|
|
static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } |
25215
|
|
|
|
|
|
|
|
25216
|
|
|
|
|
|
|
#define GET_MATCHES_HEADER2(minLen, ret_op) \ |
25217
|
|
|
|
|
|
|
uint32_t lenLimit; uint32_t hashValue; const uint8_t *cur; uint32_t curMatch; \ |
25218
|
|
|
|
|
|
|
lenLimit = p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ |
25219
|
|
|
|
|
|
|
cur = p->buffer; |
25220
|
|
|
|
|
|
|
|
25221
|
|
|
|
|
|
|
#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return 0) |
25222
|
|
|
|
|
|
|
#define SKIP_HEADER(minLen) GET_MATCHES_HEADER2(minLen, continue) |
25223
|
|
|
|
|
|
|
|
25224
|
|
|
|
|
|
|
#define MF_PARAMS(p) p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue |
25225
|
|
|
|
|
|
|
|
25226
|
|
|
|
|
|
|
#define GET_MATCHES_FOOTER(offset, maxLen) \ |
25227
|
|
|
|
|
|
|
offset = (uint32_t)(GetMatchesSpec1(lenLimit, curMatch, MF_PARAMS(p), \ |
25228
|
|
|
|
|
|
|
distances + offset, maxLen) - distances); MOVE_POS_RET; |
25229
|
|
|
|
|
|
|
|
25230
|
|
|
|
|
|
|
#define SKIP_FOOTER \ |
25231
|
|
|
|
|
|
|
SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); MOVE_POS; |
25232
|
|
|
|
|
|
|
|
25233
|
0
|
|
|
|
|
|
static uint32_t Bt2_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances) |
25234
|
|
|
|
|
|
|
{ |
25235
|
|
|
|
|
|
|
uint32_t offset; |
25236
|
0
|
0
|
|
|
|
|
GET_MATCHES_HEADER(2) |
25237
|
0
|
|
|
|
|
|
HASH2_CALC; |
25238
|
0
|
|
|
|
|
|
curMatch = p->hash[hashValue]; |
25239
|
0
|
|
|
|
|
|
p->hash[hashValue] = p->pos; |
25240
|
|
|
|
|
|
|
offset = 0; |
25241
|
0
|
0
|
|
|
|
|
GET_MATCHES_FOOTER(offset, 1) |
25242
|
|
|
|
|
|
|
} |
25243
|
|
|
|
|
|
|
|
25244
|
0
|
|
|
|
|
|
uint32_t Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances) |
25245
|
|
|
|
|
|
|
{ |
25246
|
|
|
|
|
|
|
uint32_t offset; |
25247
|
0
|
0
|
|
|
|
|
GET_MATCHES_HEADER(3) |
25248
|
0
|
|
|
|
|
|
HASH_ZIP_CALC; |
25249
|
0
|
|
|
|
|
|
curMatch = p->hash[hashValue]; |
25250
|
0
|
|
|
|
|
|
p->hash[hashValue] = p->pos; |
25251
|
|
|
|
|
|
|
offset = 0; |
25252
|
0
|
0
|
|
|
|
|
GET_MATCHES_FOOTER(offset, 2) |
25253
|
|
|
|
|
|
|
} |
25254
|
|
|
|
|
|
|
|
25255
|
0
|
|
|
|
|
|
static uint32_t Bt3_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances) |
25256
|
|
|
|
|
|
|
{ |
25257
|
|
|
|
|
|
|
uint32_t hash2Value, delta2, maxLen, offset; |
25258
|
0
|
0
|
|
|
|
|
GET_MATCHES_HEADER(3) |
25259
|
|
|
|
|
|
|
|
25260
|
0
|
|
|
|
|
|
HASH3_CALC; |
25261
|
|
|
|
|
|
|
|
25262
|
0
|
|
|
|
|
|
delta2 = p->pos - p->hash[hash2Value]; |
25263
|
0
|
|
|
|
|
|
curMatch = p->hash[kFix3HashSize + hashValue]; |
25264
|
|
|
|
|
|
|
|
25265
|
|
|
|
|
|
|
p->hash[hash2Value] = |
25266
|
0
|
|
|
|
|
|
p->hash[kFix3HashSize + hashValue] = p->pos; |
25267
|
|
|
|
|
|
|
|
25268
|
|
|
|
|
|
|
maxLen = 2; |
25269
|
|
|
|
|
|
|
offset = 0; |
25270
|
0
|
0
|
|
|
|
|
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
|
0
|
|
|
|
|
|
25271
|
|
|
|
|
|
|
{ |
25272
|
0
|
0
|
|
|
|
|
for (; maxLen != lenLimit; maxLen++) |
25273
|
0
|
0
|
|
|
|
|
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
25274
|
|
|
|
|
|
|
break; |
25275
|
0
|
|
|
|
|
|
distances[0] = maxLen; |
25276
|
0
|
|
|
|
|
|
distances[1] = delta2 - 1; |
25277
|
|
|
|
|
|
|
offset = 2; |
25278
|
0
|
0
|
|
|
|
|
if (maxLen == lenLimit) |
25279
|
|
|
|
|
|
|
{ |
25280
|
0
|
|
|
|
|
|
SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); |
25281
|
0
|
0
|
|
|
|
|
MOVE_POS_RET; |
25282
|
|
|
|
|
|
|
} |
25283
|
|
|
|
|
|
|
} |
25284
|
0
|
0
|
|
|
|
|
GET_MATCHES_FOOTER(offset, maxLen) |
25285
|
|
|
|
|
|
|
} |
25286
|
|
|
|
|
|
|
|
25287
|
0
|
|
|
|
|
|
static uint32_t Bt4_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances) |
25288
|
|
|
|
|
|
|
{ |
25289
|
|
|
|
|
|
|
uint32_t hash2Value, hash3Value, delta2, delta3, maxLen, offset; |
25290
|
0
|
0
|
|
|
|
|
GET_MATCHES_HEADER(4) |
25291
|
|
|
|
|
|
|
|
25292
|
0
|
|
|
|
|
|
HASH4_CALC; |
25293
|
|
|
|
|
|
|
|
25294
|
0
|
|
|
|
|
|
delta2 = p->pos - p->hash[ hash2Value]; |
25295
|
0
|
|
|
|
|
|
delta3 = p->pos - p->hash[kFix3HashSize + hash3Value]; |
25296
|
0
|
|
|
|
|
|
curMatch = p->hash[kFix4HashSize + hashValue]; |
25297
|
|
|
|
|
|
|
|
25298
|
|
|
|
|
|
|
p->hash[ hash2Value] = |
25299
|
|
|
|
|
|
|
p->hash[kFix3HashSize + hash3Value] = |
25300
|
0
|
|
|
|
|
|
p->hash[kFix4HashSize + hashValue] = p->pos; |
25301
|
|
|
|
|
|
|
|
25302
|
|
|
|
|
|
|
maxLen = 1; |
25303
|
|
|
|
|
|
|
offset = 0; |
25304
|
0
|
0
|
|
|
|
|
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
|
0
|
|
|
|
|
|
25305
|
|
|
|
|
|
|
{ |
25306
|
0
|
|
|
|
|
|
distances[0] = maxLen = 2; |
25307
|
0
|
|
|
|
|
|
distances[1] = delta2 - 1; |
25308
|
|
|
|
|
|
|
offset = 2; |
25309
|
|
|
|
|
|
|
} |
25310
|
0
|
0
|
|
|
|
|
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25311
|
|
|
|
|
|
|
{ |
25312
|
|
|
|
|
|
|
maxLen = 3; |
25313
|
0
|
|
|
|
|
|
distances[offset + 1] = delta3 - 1; |
25314
|
0
|
|
|
|
|
|
offset += 2; |
25315
|
|
|
|
|
|
|
delta2 = delta3; |
25316
|
|
|
|
|
|
|
} |
25317
|
0
|
0
|
|
|
|
|
if (offset != 0) |
25318
|
|
|
|
|
|
|
{ |
25319
|
0
|
0
|
|
|
|
|
for (; maxLen != lenLimit; maxLen++) |
25320
|
0
|
0
|
|
|
|
|
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
25321
|
|
|
|
|
|
|
break; |
25322
|
0
|
|
|
|
|
|
distances[offset - 2] = maxLen; |
25323
|
0
|
0
|
|
|
|
|
if (maxLen == lenLimit) |
25324
|
|
|
|
|
|
|
{ |
25325
|
0
|
|
|
|
|
|
SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); |
25326
|
0
|
0
|
|
|
|
|
MOVE_POS_RET; |
25327
|
|
|
|
|
|
|
} |
25328
|
|
|
|
|
|
|
} |
25329
|
0
|
0
|
|
|
|
|
if (maxLen < 3) |
25330
|
|
|
|
|
|
|
maxLen = 3; |
25331
|
0
|
0
|
|
|
|
|
GET_MATCHES_FOOTER(offset, maxLen) |
25332
|
|
|
|
|
|
|
} |
25333
|
|
|
|
|
|
|
|
25334
|
0
|
|
|
|
|
|
static uint32_t Hc4_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances) |
25335
|
|
|
|
|
|
|
{ |
25336
|
|
|
|
|
|
|
uint32_t hash2Value, hash3Value, delta2, delta3, maxLen, offset; |
25337
|
0
|
0
|
|
|
|
|
GET_MATCHES_HEADER(4) |
25338
|
|
|
|
|
|
|
|
25339
|
0
|
|
|
|
|
|
HASH4_CALC; |
25340
|
|
|
|
|
|
|
|
25341
|
0
|
|
|
|
|
|
delta2 = p->pos - p->hash[ hash2Value]; |
25342
|
0
|
|
|
|
|
|
delta3 = p->pos - p->hash[kFix3HashSize + hash3Value]; |
25343
|
0
|
|
|
|
|
|
curMatch = p->hash[kFix4HashSize + hashValue]; |
25344
|
|
|
|
|
|
|
|
25345
|
|
|
|
|
|
|
p->hash[ hash2Value] = |
25346
|
|
|
|
|
|
|
p->hash[kFix3HashSize + hash3Value] = |
25347
|
0
|
|
|
|
|
|
p->hash[kFix4HashSize + hashValue] = p->pos; |
25348
|
|
|
|
|
|
|
|
25349
|
|
|
|
|
|
|
maxLen = 1; |
25350
|
|
|
|
|
|
|
offset = 0; |
25351
|
0
|
0
|
|
|
|
|
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
|
0
|
|
|
|
|
|
25352
|
|
|
|
|
|
|
{ |
25353
|
0
|
|
|
|
|
|
distances[0] = maxLen = 2; |
25354
|
0
|
|
|
|
|
|
distances[1] = delta2 - 1; |
25355
|
|
|
|
|
|
|
offset = 2; |
25356
|
|
|
|
|
|
|
} |
25357
|
0
|
0
|
|
|
|
|
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25358
|
|
|
|
|
|
|
{ |
25359
|
|
|
|
|
|
|
maxLen = 3; |
25360
|
0
|
|
|
|
|
|
distances[offset + 1] = delta3 - 1; |
25361
|
0
|
|
|
|
|
|
offset += 2; |
25362
|
|
|
|
|
|
|
delta2 = delta3; |
25363
|
|
|
|
|
|
|
} |
25364
|
0
|
0
|
|
|
|
|
if (offset != 0) |
25365
|
|
|
|
|
|
|
{ |
25366
|
0
|
0
|
|
|
|
|
for (; maxLen != lenLimit; maxLen++) |
25367
|
0
|
0
|
|
|
|
|
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
25368
|
|
|
|
|
|
|
break; |
25369
|
0
|
|
|
|
|
|
distances[offset - 2] = maxLen; |
25370
|
0
|
0
|
|
|
|
|
if (maxLen == lenLimit) |
25371
|
|
|
|
|
|
|
{ |
25372
|
0
|
|
|
|
|
|
p->son[p->cyclicBufferPos] = curMatch; |
25373
|
0
|
0
|
|
|
|
|
MOVE_POS_RET; |
25374
|
|
|
|
|
|
|
} |
25375
|
|
|
|
|
|
|
} |
25376
|
0
|
0
|
|
|
|
|
if (maxLen < 3) |
25377
|
|
|
|
|
|
|
maxLen = 3; |
25378
|
0
|
|
|
|
|
|
offset = (uint32_t)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), |
25379
|
0
|
|
|
|
|
|
distances + offset, maxLen) - (distances)); |
25380
|
0
|
0
|
|
|
|
|
MOVE_POS_RET |
25381
|
|
|
|
|
|
|
} |
25382
|
|
|
|
|
|
|
|
25383
|
0
|
|
|
|
|
|
uint32_t Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances) |
25384
|
|
|
|
|
|
|
{ |
25385
|
|
|
|
|
|
|
uint32_t offset; |
25386
|
0
|
0
|
|
|
|
|
GET_MATCHES_HEADER(3) |
25387
|
0
|
|
|
|
|
|
HASH_ZIP_CALC; |
25388
|
0
|
|
|
|
|
|
curMatch = p->hash[hashValue]; |
25389
|
0
|
|
|
|
|
|
p->hash[hashValue] = p->pos; |
25390
|
0
|
|
|
|
|
|
offset = (uint32_t)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p), |
25391
|
0
|
|
|
|
|
|
distances, 2) - (distances)); |
25392
|
0
|
0
|
|
|
|
|
MOVE_POS_RET |
25393
|
|
|
|
|
|
|
} |
25394
|
|
|
|
|
|
|
|
25395
|
0
|
|
|
|
|
|
static void Bt2_MatchFinder_Skip(CMatchFinder *p, uint32_t num) |
25396
|
|
|
|
|
|
|
{ |
25397
|
0
|
0
|
|
|
|
|
do |
25398
|
|
|
|
|
|
|
{ |
25399
|
0
|
0
|
|
|
|
|
SKIP_HEADER(2) |
25400
|
0
|
|
|
|
|
|
HASH2_CALC; |
25401
|
0
|
|
|
|
|
|
curMatch = p->hash[hashValue]; |
25402
|
0
|
|
|
|
|
|
p->hash[hashValue] = p->pos; |
25403
|
0
|
0
|
|
|
|
|
SKIP_FOOTER |
25404
|
|
|
|
|
|
|
} |
25405
|
|
|
|
|
|
|
while (--num != 0); |
25406
|
0
|
|
|
|
|
|
} |
25407
|
|
|
|
|
|
|
|
25408
|
0
|
|
|
|
|
|
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num) |
25409
|
|
|
|
|
|
|
{ |
25410
|
0
|
0
|
|
|
|
|
do |
25411
|
|
|
|
|
|
|
{ |
25412
|
0
|
0
|
|
|
|
|
SKIP_HEADER(3) |
25413
|
0
|
|
|
|
|
|
HASH_ZIP_CALC; |
25414
|
0
|
|
|
|
|
|
curMatch = p->hash[hashValue]; |
25415
|
0
|
|
|
|
|
|
p->hash[hashValue] = p->pos; |
25416
|
0
|
0
|
|
|
|
|
SKIP_FOOTER |
25417
|
|
|
|
|
|
|
} |
25418
|
|
|
|
|
|
|
while (--num != 0); |
25419
|
0
|
|
|
|
|
|
} |
25420
|
|
|
|
|
|
|
|
25421
|
0
|
|
|
|
|
|
static void Bt3_MatchFinder_Skip(CMatchFinder *p, uint32_t num) |
25422
|
|
|
|
|
|
|
{ |
25423
|
0
|
0
|
|
|
|
|
do |
25424
|
|
|
|
|
|
|
{ |
25425
|
|
|
|
|
|
|
uint32_t hash2Value; |
25426
|
0
|
0
|
|
|
|
|
SKIP_HEADER(3) |
25427
|
0
|
|
|
|
|
|
HASH3_CALC; |
25428
|
0
|
|
|
|
|
|
curMatch = p->hash[kFix3HashSize + hashValue]; |
25429
|
0
|
|
|
|
|
|
p->hash[hash2Value] = |
25430
|
0
|
|
|
|
|
|
p->hash[kFix3HashSize + hashValue] = p->pos; |
25431
|
0
|
0
|
|
|
|
|
SKIP_FOOTER |
25432
|
|
|
|
|
|
|
} |
25433
|
|
|
|
|
|
|
while (--num != 0); |
25434
|
0
|
|
|
|
|
|
} |
25435
|
|
|
|
|
|
|
|
25436
|
0
|
|
|
|
|
|
static void Bt4_MatchFinder_Skip(CMatchFinder *p, uint32_t num) |
25437
|
|
|
|
|
|
|
{ |
25438
|
0
|
0
|
|
|
|
|
do |
25439
|
|
|
|
|
|
|
{ |
25440
|
|
|
|
|
|
|
uint32_t hash2Value, hash3Value; |
25441
|
0
|
0
|
|
|
|
|
SKIP_HEADER(4) |
25442
|
0
|
|
|
|
|
|
HASH4_CALC; |
25443
|
0
|
|
|
|
|
|
curMatch = p->hash[kFix4HashSize + hashValue]; |
25444
|
0
|
|
|
|
|
|
p->hash[ hash2Value] = |
25445
|
0
|
|
|
|
|
|
p->hash[kFix3HashSize + hash3Value] = p->pos; |
25446
|
0
|
|
|
|
|
|
p->hash[kFix4HashSize + hashValue] = p->pos; |
25447
|
0
|
0
|
|
|
|
|
SKIP_FOOTER |
25448
|
|
|
|
|
|
|
} |
25449
|
|
|
|
|
|
|
while (--num != 0); |
25450
|
0
|
|
|
|
|
|
} |
25451
|
|
|
|
|
|
|
|
25452
|
0
|
|
|
|
|
|
static void Hc4_MatchFinder_Skip(CMatchFinder *p, uint32_t num) |
25453
|
|
|
|
|
|
|
{ |
25454
|
0
|
0
|
|
|
|
|
do |
25455
|
|
|
|
|
|
|
{ |
25456
|
|
|
|
|
|
|
uint32_t hash2Value, hash3Value; |
25457
|
0
|
0
|
|
|
|
|
SKIP_HEADER(4) |
25458
|
0
|
|
|
|
|
|
HASH4_CALC; |
25459
|
0
|
|
|
|
|
|
curMatch = p->hash[kFix4HashSize + hashValue]; |
25460
|
0
|
|
|
|
|
|
p->hash[ hash2Value] = |
25461
|
0
|
|
|
|
|
|
p->hash[kFix3HashSize + hash3Value] = |
25462
|
0
|
|
|
|
|
|
p->hash[kFix4HashSize + hashValue] = p->pos; |
25463
|
0
|
|
|
|
|
|
p->son[p->cyclicBufferPos] = curMatch; |
25464
|
0
|
0
|
|
|
|
|
MOVE_POS |
25465
|
|
|
|
|
|
|
} |
25466
|
|
|
|
|
|
|
while (--num != 0); |
25467
|
0
|
|
|
|
|
|
} |
25468
|
|
|
|
|
|
|
|
25469
|
0
|
|
|
|
|
|
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num) |
25470
|
|
|
|
|
|
|
{ |
25471
|
0
|
0
|
|
|
|
|
do |
25472
|
|
|
|
|
|
|
{ |
25473
|
0
|
0
|
|
|
|
|
SKIP_HEADER(3) |
25474
|
0
|
|
|
|
|
|
HASH_ZIP_CALC; |
25475
|
0
|
|
|
|
|
|
curMatch = p->hash[hashValue]; |
25476
|
0
|
|
|
|
|
|
p->hash[hashValue] = p->pos; |
25477
|
0
|
|
|
|
|
|
p->son[p->cyclicBufferPos] = curMatch; |
25478
|
0
|
0
|
|
|
|
|
MOVE_POS |
25479
|
|
|
|
|
|
|
} |
25480
|
|
|
|
|
|
|
while (--num != 0); |
25481
|
0
|
|
|
|
|
|
} |
25482
|
|
|
|
|
|
|
|
25483
|
0
|
|
|
|
|
|
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable) |
25484
|
|
|
|
|
|
|
{ |
25485
|
0
|
|
|
|
|
|
vTable->Init = (Mf_Init_Func)MatchFinder_Init; |
25486
|
0
|
|
|
|
|
|
vTable->GetIndexByte = (Mf_GetIndexByte_Func)MatchFinder_GetIndexByte; |
25487
|
0
|
|
|
|
|
|
vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes; |
25488
|
0
|
|
|
|
|
|
vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos; |
25489
|
0
|
0
|
|
|
|
|
if (!p->btMode) |
|
|
0
|
|
|
|
|
|
25490
|
|
|
|
|
|
|
{ |
25491
|
0
|
|
|
|
|
|
vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches; |
25492
|
0
|
|
|
|
|
|
vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip; |
25493
|
|
|
|
|
|
|
} |
25494
|
0
|
0
|
|
|
|
|
else if (p->numHashBytes == 2) |
|
|
0
|
|
|
|
|
|
25495
|
|
|
|
|
|
|
{ |
25496
|
0
|
|
|
|
|
|
vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches; |
25497
|
0
|
|
|
|
|
|
vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip; |
25498
|
|
|
|
|
|
|
} |
25499
|
0
|
0
|
|
|
|
|
else if (p->numHashBytes == 3) |
|
|
0
|
|
|
|
|
|
25500
|
|
|
|
|
|
|
{ |
25501
|
0
|
|
|
|
|
|
vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches; |
25502
|
0
|
|
|
|
|
|
vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip; |
25503
|
|
|
|
|
|
|
} |
25504
|
|
|
|
|
|
|
else |
25505
|
|
|
|
|
|
|
{ |
25506
|
0
|
|
|
|
|
|
vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches; |
25507
|
0
|
|
|
|
|
|
vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip; |
25508
|
|
|
|
|
|
|
} |
25509
|
0
|
|
|
|
|
|
} |
25510
|
|
|
|
|
|
|
|
25511
|
|
|
|
|
|
|
// LzmaEnc.h -- LZMA Encoder |
25512
|
|
|
|
|
|
|
// 2009-02-07 : Igor Pavlov : Public domain |
25513
|
|
|
|
|
|
|
|
25514
|
|
|
|
|
|
|
#define LZMA_PROPS_SIZE 5 |
25515
|
|
|
|
|
|
|
|
25516
|
|
|
|
|
|
|
struct CLzmaEncProps |
25517
|
|
|
|
|
|
|
{ |
25518
|
|
|
|
|
|
|
int level; /* 0 <= level <= 9 */ |
25519
|
|
|
|
|
|
|
uint32_t dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version |
25520
|
|
|
|
|
|
|
(1 << 12) <= dictSize <= (1 << 30) for 64-bit version |
25521
|
|
|
|
|
|
|
default = (1 << 24) */ |
25522
|
|
|
|
|
|
|
int lc; /* 0 <= lc <= 8, default = 3 */ |
25523
|
|
|
|
|
|
|
int lp; /* 0 <= lp <= 4, default = 0 */ |
25524
|
|
|
|
|
|
|
int pb; /* 0 <= pb <= 4, default = 2 */ |
25525
|
|
|
|
|
|
|
int algo; /* 0 - fast, 1 - normal, default = 1 */ |
25526
|
|
|
|
|
|
|
int fb; /* 5 <= fb <= 273, default = 32 */ |
25527
|
|
|
|
|
|
|
int btMode; /* 0 - hashChain Mode, 1 - binTree mode - normal, default = 1 */ |
25528
|
|
|
|
|
|
|
int numHashBytes; /* 2, 3 or 4, default = 4 */ |
25529
|
|
|
|
|
|
|
uint32_t mc; /* 1 <= mc <= (1 << 30), default = 32 */ |
25530
|
|
|
|
|
|
|
unsigned writeEndMark; /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */ |
25531
|
|
|
|
|
|
|
int numThreads; /* 1 or 2, default = 2 */ |
25532
|
|
|
|
|
|
|
}; |
25533
|
|
|
|
|
|
|
|
25534
|
|
|
|
|
|
|
void LzmaEncProps_Init(CLzmaEncProps *p); |
25535
|
|
|
|
|
|
|
void LzmaEncProps_Normalize(CLzmaEncProps *p); |
25536
|
|
|
|
|
|
|
uint32_t LzmaEncProps_GetDictSize(const CLzmaEncProps *props2); |
25537
|
|
|
|
|
|
|
|
25538
|
|
|
|
|
|
|
/* ---------- CLzmaEncHandle Interface ---------- */ |
25539
|
|
|
|
|
|
|
|
25540
|
|
|
|
|
|
|
/* LzmaEnc_* functions can return the following exit codes: |
25541
|
|
|
|
|
|
|
Returns: |
25542
|
|
|
|
|
|
|
SZ_OK - OK |
25543
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
25544
|
|
|
|
|
|
|
SZ_ERROR_PARAM - Incorrect paramater in props |
25545
|
|
|
|
|
|
|
SZ_ERROR_WRITE - Write callback error. |
25546
|
|
|
|
|
|
|
SZ_ERROR_PROGRESS - some break from progress callback |
25547
|
|
|
|
|
|
|
SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) |
25548
|
|
|
|
|
|
|
*/ |
25549
|
|
|
|
|
|
|
|
25550
|
|
|
|
|
|
|
typedef void * CLzmaEncHandle; |
25551
|
|
|
|
|
|
|
|
25552
|
|
|
|
|
|
|
CLzmaEncHandle LzmaEnc_Create(ISzAlloc *alloc); |
25553
|
|
|
|
|
|
|
void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAlloc *alloc, ISzAlloc *allocBig); |
25554
|
|
|
|
|
|
|
SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props); |
25555
|
|
|
|
|
|
|
SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, uint8_t *properties, size_t *size); |
25556
|
|
|
|
|
|
|
SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, |
25557
|
|
|
|
|
|
|
ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); |
25558
|
|
|
|
|
|
|
SRes LzmaEnc_MemEncode(CLzmaEncHandle p, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen, |
25559
|
|
|
|
|
|
|
int writeEndMark, ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); |
25560
|
|
|
|
|
|
|
|
25561
|
|
|
|
|
|
|
/* ---------- One Call Interface ---------- */ |
25562
|
|
|
|
|
|
|
|
25563
|
|
|
|
|
|
|
/* LzmaEncode |
25564
|
|
|
|
|
|
|
Return code: |
25565
|
|
|
|
|
|
|
SZ_OK - OK |
25566
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
25567
|
|
|
|
|
|
|
SZ_ERROR_PARAM - Incorrect paramater |
25568
|
|
|
|
|
|
|
SZ_ERROR_OUTPUT_EOF - output buffer overflow |
25569
|
|
|
|
|
|
|
SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) |
25570
|
|
|
|
|
|
|
*/ |
25571
|
|
|
|
|
|
|
|
25572
|
|
|
|
|
|
|
SRes LzmaEncode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen, |
25573
|
|
|
|
|
|
|
const CLzmaEncProps *props, uint8_t *propsEncoded, size_t *propsSize, int writeEndMark, |
25574
|
|
|
|
|
|
|
ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); |
25575
|
|
|
|
|
|
|
|
25576
|
|
|
|
|
|
|
// LzmaEnc.c -- LZMA Encoder |
25577
|
|
|
|
|
|
|
// 2010-04-16 : Igor Pavlov : Public domain |
25578
|
|
|
|
|
|
|
|
25579
|
|
|
|
|
|
|
#define kBlockSizeMax ((1 << LZMA_NUM_BLOCK_SIZE_BITS) - 1) |
25580
|
|
|
|
|
|
|
|
25581
|
|
|
|
|
|
|
#define kBlockSize (9 << 10) |
25582
|
|
|
|
|
|
|
#define kUnpackBlockSize (1 << 18) |
25583
|
|
|
|
|
|
|
#define kMatchArraySize (1 << 21) |
25584
|
|
|
|
|
|
|
#define kMatchRecordMaxSize ((LZMA_MATCH_LEN_MAX * 2 + 3) * LZMA_MATCH_LEN_MAX) |
25585
|
|
|
|
|
|
|
|
25586
|
|
|
|
|
|
|
#define kNumMaxDirectBits (31) |
25587
|
|
|
|
|
|
|
|
25588
|
|
|
|
|
|
|
#define kNumTopBits 24 |
25589
|
|
|
|
|
|
|
#define kTopValue ((uint32_t)1 << kNumTopBits) |
25590
|
|
|
|
|
|
|
|
25591
|
|
|
|
|
|
|
#define kNumBitModelTotalBits 11 |
25592
|
|
|
|
|
|
|
#define kBitModelTotal (1 << kNumBitModelTotalBits) |
25593
|
|
|
|
|
|
|
#define kNumMoveBits 5 |
25594
|
|
|
|
|
|
|
#define kProbInitValue (kBitModelTotal >> 1) |
25595
|
|
|
|
|
|
|
|
25596
|
|
|
|
|
|
|
#define kNumMoveReducingBits 4 |
25597
|
|
|
|
|
|
|
#define kNumBitPriceShiftBits 4 |
25598
|
|
|
|
|
|
|
#define kBitPrice (1 << kNumBitPriceShiftBits) |
25599
|
|
|
|
|
|
|
|
25600
|
0
|
|
|
|
|
|
void LzmaEncProps_Init(CLzmaEncProps *p) |
25601
|
|
|
|
|
|
|
{ |
25602
|
0
|
|
|
|
|
|
p->level = 5; |
25603
|
0
|
|
|
|
|
|
p->dictSize = p->mc = 0; |
25604
|
0
|
|
|
|
|
|
p->lc = p->lp = p->pb = p->algo = p->fb = p->btMode = p->numHashBytes = p->numThreads = -1; |
25605
|
0
|
|
|
|
|
|
p->writeEndMark = 0; |
25606
|
0
|
|
|
|
|
|
} |
25607
|
|
|
|
|
|
|
|
25608
|
0
|
|
|
|
|
|
void LzmaEncProps_Normalize(CLzmaEncProps *p) |
25609
|
|
|
|
|
|
|
{ |
25610
|
0
|
|
|
|
|
|
int level = p->level; |
25611
|
0
|
0
|
|
|
|
|
if (level < 0) level = 5; |
25612
|
0
|
|
|
|
|
|
p->level = level; |
25613
|
0
|
0
|
|
|
|
|
if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26))); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25614
|
0
|
0
|
|
|
|
|
if (p->lc < 0) p->lc = 3; |
25615
|
0
|
0
|
|
|
|
|
if (p->lp < 0) p->lp = 0; |
25616
|
0
|
0
|
|
|
|
|
if (p->pb < 0) p->pb = 2; |
25617
|
0
|
0
|
|
|
|
|
if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); |
25618
|
0
|
0
|
|
|
|
|
if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); |
|
|
0
|
|
|
|
|
|
25619
|
0
|
0
|
|
|
|
|
if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); |
25620
|
0
|
0
|
|
|
|
|
if (p->numHashBytes < 0) p->numHashBytes = 4; |
25621
|
0
|
0
|
|
|
|
|
if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1); |
25622
|
0
|
0
|
|
|
|
|
if (p->numThreads < 0) |
25623
|
0
|
|
|
|
|
|
p->numThreads = 1; |
25624
|
0
|
|
|
|
|
|
} |
25625
|
|
|
|
|
|
|
|
25626
|
0
|
|
|
|
|
|
uint32_t LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) |
25627
|
|
|
|
|
|
|
{ |
25628
|
0
|
|
|
|
|
|
CLzmaEncProps props = *props2; |
25629
|
0
|
|
|
|
|
|
LzmaEncProps_Normalize(&props); |
25630
|
0
|
|
|
|
|
|
return props.dictSize; |
25631
|
|
|
|
|
|
|
} |
25632
|
|
|
|
|
|
|
|
25633
|
|
|
|
|
|
|
/* #define LZMA_LOG_BSR */ |
25634
|
|
|
|
|
|
|
/* Define it for Intel's CPU */ |
25635
|
|
|
|
|
|
|
|
25636
|
|
|
|
|
|
|
#ifdef LZMA_LOG_BSR |
25637
|
|
|
|
|
|
|
|
25638
|
|
|
|
|
|
|
#define kDicLogSizeMaxCompress 30 |
25639
|
|
|
|
|
|
|
|
25640
|
|
|
|
|
|
|
#define BSR2_RET(pos, res) { unsigned long i; _BitScanReverse(&i, (pos)); res = (i + i) + ((pos >> (i - 1)) & 1); } |
25641
|
|
|
|
|
|
|
|
25642
|
|
|
|
|
|
|
uint32_t GetPosSlot1(uint32_t pos) |
25643
|
|
|
|
|
|
|
{ |
25644
|
|
|
|
|
|
|
uint32_t res; |
25645
|
|
|
|
|
|
|
BSR2_RET(pos, res); |
25646
|
|
|
|
|
|
|
return res; |
25647
|
|
|
|
|
|
|
} |
25648
|
|
|
|
|
|
|
#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } |
25649
|
|
|
|
|
|
|
#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } |
25650
|
|
|
|
|
|
|
|
25651
|
|
|
|
|
|
|
#else |
25652
|
|
|
|
|
|
|
|
25653
|
|
|
|
|
|
|
//#define kNumLogBits (9 + (int)sizeof(size_t) / 2) |
25654
|
|
|
|
|
|
|
#define kNumLogBits (9 + (int)sizeof(uint32_t) / 2) |
25655
|
|
|
|
|
|
|
#define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7) |
25656
|
|
|
|
|
|
|
|
25657
|
0
|
|
|
|
|
|
void LzmaEnc_FastPosInit(uint8_t *g_FastPos) |
25658
|
|
|
|
|
|
|
{ |
25659
|
|
|
|
|
|
|
int c = 2, slotFast; |
25660
|
0
|
|
|
|
|
|
g_FastPos[0] = 0; |
25661
|
0
|
|
|
|
|
|
g_FastPos[1] = 1; |
25662
|
|
|
|
|
|
|
|
25663
|
0
|
0
|
|
|
|
|
for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++) |
|
|
0
|
|
|
|
|
|
25664
|
|
|
|
|
|
|
{ |
25665
|
0
|
|
|
|
|
|
uint32_t k = (1 << ((slotFast >> 1) - 1)); |
25666
|
|
|
|
|
|
|
uint32_t j; |
25667
|
0
|
0
|
|
|
|
|
for (j = 0; j < k; j++, c++) |
|
|
0
|
|
|
|
|
|
25668
|
0
|
|
|
|
|
|
g_FastPos[c] = (uint8_t)slotFast; |
25669
|
|
|
|
|
|
|
} |
25670
|
0
|
|
|
|
|
|
} |
25671
|
|
|
|
|
|
|
|
25672
|
|
|
|
|
|
|
#define BSR2_RET(pos, res) { uint32_t i = 6 + ((kNumLogBits - 1) & \ |
25673
|
|
|
|
|
|
|
(0 - (((((uint32_t)1 << (kNumLogBits + 6)) - 1) - pos) >> 31))); \ |
25674
|
|
|
|
|
|
|
res = p->g_FastPos[pos >> i] + (i * 2); } |
25675
|
|
|
|
|
|
|
/* |
25676
|
|
|
|
|
|
|
#define BSR2_RET(pos, res) { res = (pos < (1 << (kNumLogBits + 6))) ? \ |
25677
|
|
|
|
|
|
|
p->g_FastPos[pos >> 6] + 12 : \ |
25678
|
|
|
|
|
|
|
p->g_FastPos[pos >> (6 + kNumLogBits - 1)] + (6 + (kNumLogBits - 1)) * 2; } |
25679
|
|
|
|
|
|
|
*/ |
25680
|
|
|
|
|
|
|
|
25681
|
|
|
|
|
|
|
#define GetPosSlot1(pos) p->g_FastPos[pos] |
25682
|
|
|
|
|
|
|
#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } |
25683
|
|
|
|
|
|
|
#define GetPosSlot(pos, res) { if (pos < kNumFullDistances) res = p->g_FastPos[pos]; else BSR2_RET(pos, res); } |
25684
|
|
|
|
|
|
|
|
25685
|
|
|
|
|
|
|
#endif |
25686
|
|
|
|
|
|
|
|
25687
|
|
|
|
|
|
|
#define LZMA_NUM_REPS 4 |
25688
|
|
|
|
|
|
|
|
25689
|
|
|
|
|
|
|
typedef unsigned CState; |
25690
|
|
|
|
|
|
|
|
25691
|
|
|
|
|
|
|
struct COptimal |
25692
|
|
|
|
|
|
|
{ |
25693
|
|
|
|
|
|
|
uint32_t price; |
25694
|
|
|
|
|
|
|
|
25695
|
|
|
|
|
|
|
CState state; |
25696
|
|
|
|
|
|
|
int prev1IsChar; |
25697
|
|
|
|
|
|
|
int prev2; |
25698
|
|
|
|
|
|
|
|
25699
|
|
|
|
|
|
|
uint32_t posPrev2; |
25700
|
|
|
|
|
|
|
uint32_t backPrev2; |
25701
|
|
|
|
|
|
|
|
25702
|
|
|
|
|
|
|
uint32_t posPrev; |
25703
|
|
|
|
|
|
|
uint32_t backPrev; |
25704
|
|
|
|
|
|
|
uint32_t backs[LZMA_NUM_REPS]; |
25705
|
|
|
|
|
|
|
}; |
25706
|
|
|
|
|
|
|
|
25707
|
|
|
|
|
|
|
#define kNumOpts (1 << 12) |
25708
|
|
|
|
|
|
|
|
25709
|
|
|
|
|
|
|
#define kNumLenToPosStates 4 |
25710
|
|
|
|
|
|
|
#define kNumPosSlotBits 6 |
25711
|
|
|
|
|
|
|
#define kDicLogSizeMin 0 |
25712
|
|
|
|
|
|
|
#define kDicLogSizeMax 32 |
25713
|
|
|
|
|
|
|
#define kDistTableSizeMax (kDicLogSizeMax * 2) |
25714
|
|
|
|
|
|
|
|
25715
|
|
|
|
|
|
|
#define kNumAlignBits 4 |
25716
|
|
|
|
|
|
|
#define kAlignTableSize (1 << kNumAlignBits) |
25717
|
|
|
|
|
|
|
#define kAlignMask (kAlignTableSize - 1) |
25718
|
|
|
|
|
|
|
|
25719
|
|
|
|
|
|
|
#define kStartPosModelIndex 4 |
25720
|
|
|
|
|
|
|
#define kEndPosModelIndex 14 |
25721
|
|
|
|
|
|
|
#define kNumPosModels (kEndPosModelIndex - kStartPosModelIndex) |
25722
|
|
|
|
|
|
|
|
25723
|
|
|
|
|
|
|
#define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) |
25724
|
|
|
|
|
|
|
|
25725
|
|
|
|
|
|
|
#ifdef _LZMA_PROB32 |
25726
|
|
|
|
|
|
|
#define CLzmaProb uint32_t |
25727
|
|
|
|
|
|
|
#else |
25728
|
|
|
|
|
|
|
#define CLzmaProb uint16_t |
25729
|
|
|
|
|
|
|
#endif |
25730
|
|
|
|
|
|
|
|
25731
|
|
|
|
|
|
|
#define LZMA_PB_MAX 4 |
25732
|
|
|
|
|
|
|
#define LZMA_LC_MAX 8 |
25733
|
|
|
|
|
|
|
#define LZMA_LP_MAX 4 |
25734
|
|
|
|
|
|
|
|
25735
|
|
|
|
|
|
|
#define LZMA_NUM_PB_STATES_MAX (1 << LZMA_PB_MAX) |
25736
|
|
|
|
|
|
|
|
25737
|
|
|
|
|
|
|
#define kLenNumLowBits 3 |
25738
|
|
|
|
|
|
|
#define kLenNumLowSymbols (1 << kLenNumLowBits) |
25739
|
|
|
|
|
|
|
#define kLenNumMidBits 3 |
25740
|
|
|
|
|
|
|
#define kLenNumMidSymbols (1 << kLenNumMidBits) |
25741
|
|
|
|
|
|
|
#define kLenNumHighBits 8 |
25742
|
|
|
|
|
|
|
#define kLenNumHighSymbols (1 << kLenNumHighBits) |
25743
|
|
|
|
|
|
|
|
25744
|
|
|
|
|
|
|
#define kLenNumSymbolsTotal (kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols) |
25745
|
|
|
|
|
|
|
|
25746
|
|
|
|
|
|
|
#define LZMA_MATCH_LEN_MIN 2 |
25747
|
|
|
|
|
|
|
#define LZMA_MATCH_LEN_MAX (LZMA_MATCH_LEN_MIN + kLenNumSymbolsTotal - 1) |
25748
|
|
|
|
|
|
|
|
25749
|
|
|
|
|
|
|
#define kNumStates 12 |
25750
|
|
|
|
|
|
|
|
25751
|
|
|
|
|
|
|
struct CLenEnc |
25752
|
|
|
|
|
|
|
{ |
25753
|
|
|
|
|
|
|
CLzmaProb choice; |
25754
|
|
|
|
|
|
|
CLzmaProb choice2; |
25755
|
|
|
|
|
|
|
CLzmaProb low[LZMA_NUM_PB_STATES_MAX << kLenNumLowBits]; |
25756
|
|
|
|
|
|
|
CLzmaProb mid[LZMA_NUM_PB_STATES_MAX << kLenNumMidBits]; |
25757
|
|
|
|
|
|
|
CLzmaProb high[kLenNumHighSymbols]; |
25758
|
|
|
|
|
|
|
}; |
25759
|
|
|
|
|
|
|
|
25760
|
|
|
|
|
|
|
struct CLenPriceEnc |
25761
|
|
|
|
|
|
|
{ |
25762
|
|
|
|
|
|
|
CLenEnc p; |
25763
|
|
|
|
|
|
|
uint32_t prices[LZMA_NUM_PB_STATES_MAX][kLenNumSymbolsTotal]; |
25764
|
|
|
|
|
|
|
uint32_t tableSize; |
25765
|
|
|
|
|
|
|
uint32_t counters[LZMA_NUM_PB_STATES_MAX]; |
25766
|
|
|
|
|
|
|
}; |
25767
|
|
|
|
|
|
|
|
25768
|
|
|
|
|
|
|
struct CRangeEnc |
25769
|
|
|
|
|
|
|
{ |
25770
|
|
|
|
|
|
|
uint32_t range; |
25771
|
|
|
|
|
|
|
uint8_t cache; |
25772
|
|
|
|
|
|
|
uint64_t low; |
25773
|
|
|
|
|
|
|
uint64_t cacheSize; |
25774
|
|
|
|
|
|
|
uint8_t *buf; |
25775
|
|
|
|
|
|
|
uint8_t *bufLim; |
25776
|
|
|
|
|
|
|
uint8_t *bufBase; |
25777
|
|
|
|
|
|
|
ISeqOutStream *outStream; |
25778
|
|
|
|
|
|
|
uint64_t processed; |
25779
|
|
|
|
|
|
|
SRes res; |
25780
|
|
|
|
|
|
|
}; |
25781
|
|
|
|
|
|
|
|
25782
|
|
|
|
|
|
|
struct CSaveState |
25783
|
|
|
|
|
|
|
{ |
25784
|
|
|
|
|
|
|
CLzmaProb *litProbs; |
25785
|
|
|
|
|
|
|
|
25786
|
|
|
|
|
|
|
CLzmaProb isMatch[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
25787
|
|
|
|
|
|
|
CLzmaProb isRep[kNumStates]; |
25788
|
|
|
|
|
|
|
CLzmaProb isRepG0[kNumStates]; |
25789
|
|
|
|
|
|
|
CLzmaProb isRepG1[kNumStates]; |
25790
|
|
|
|
|
|
|
CLzmaProb isRepG2[kNumStates]; |
25791
|
|
|
|
|
|
|
CLzmaProb isRep0Long[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
25792
|
|
|
|
|
|
|
|
25793
|
|
|
|
|
|
|
CLzmaProb posSlotEncoder[kNumLenToPosStates][1 << kNumPosSlotBits]; |
25794
|
|
|
|
|
|
|
CLzmaProb posEncoders[kNumFullDistances - kEndPosModelIndex]; |
25795
|
|
|
|
|
|
|
CLzmaProb posAlignEncoder[1 << kNumAlignBits]; |
25796
|
|
|
|
|
|
|
|
25797
|
|
|
|
|
|
|
CLenPriceEnc lenEnc; |
25798
|
|
|
|
|
|
|
CLenPriceEnc repLenEnc; |
25799
|
|
|
|
|
|
|
|
25800
|
|
|
|
|
|
|
uint32_t reps[LZMA_NUM_REPS]; |
25801
|
|
|
|
|
|
|
uint32_t state; |
25802
|
|
|
|
|
|
|
}; |
25803
|
|
|
|
|
|
|
|
25804
|
|
|
|
|
|
|
struct CLzmaEnc |
25805
|
|
|
|
|
|
|
{ |
25806
|
|
|
|
|
|
|
IMatchFinder matchFinder; |
25807
|
|
|
|
|
|
|
CMatchFinder *matchFinderObj; |
25808
|
|
|
|
|
|
|
|
25809
|
|
|
|
|
|
|
CMatchFinder matchFinderBase; |
25810
|
|
|
|
|
|
|
|
25811
|
|
|
|
|
|
|
uint32_t optimumEndIndex; |
25812
|
|
|
|
|
|
|
uint32_t optimumCurrentIndex; |
25813
|
|
|
|
|
|
|
|
25814
|
|
|
|
|
|
|
uint32_t longestMatchLength; |
25815
|
|
|
|
|
|
|
uint32_t numPairs; |
25816
|
|
|
|
|
|
|
uint32_t numAvail; |
25817
|
|
|
|
|
|
|
COptimal opt[kNumOpts]; |
25818
|
|
|
|
|
|
|
|
25819
|
|
|
|
|
|
|
#ifndef LZMA_LOG_BSR |
25820
|
|
|
|
|
|
|
uint8_t g_FastPos[1 << kNumLogBits]; |
25821
|
|
|
|
|
|
|
#endif |
25822
|
|
|
|
|
|
|
|
25823
|
|
|
|
|
|
|
uint32_t ProbPrices[kBitModelTotal >> kNumMoveReducingBits]; |
25824
|
|
|
|
|
|
|
uint32_t matches[LZMA_MATCH_LEN_MAX * 2 + 2 + 1]; |
25825
|
|
|
|
|
|
|
uint32_t numFastBytes; |
25826
|
|
|
|
|
|
|
uint32_t additionalOffset; |
25827
|
|
|
|
|
|
|
uint32_t reps[LZMA_NUM_REPS]; |
25828
|
|
|
|
|
|
|
uint32_t state; |
25829
|
|
|
|
|
|
|
|
25830
|
|
|
|
|
|
|
uint32_t posSlotPrices[kNumLenToPosStates][kDistTableSizeMax]; |
25831
|
|
|
|
|
|
|
uint32_t distancesPrices[kNumLenToPosStates][kNumFullDistances]; |
25832
|
|
|
|
|
|
|
uint32_t alignPrices[kAlignTableSize]; |
25833
|
|
|
|
|
|
|
uint32_t alignPriceCount; |
25834
|
|
|
|
|
|
|
|
25835
|
|
|
|
|
|
|
uint32_t distTableSize; |
25836
|
|
|
|
|
|
|
|
25837
|
|
|
|
|
|
|
unsigned lc, lp, pb; |
25838
|
|
|
|
|
|
|
unsigned lpMask, pbMask; |
25839
|
|
|
|
|
|
|
|
25840
|
|
|
|
|
|
|
CLzmaProb *litProbs; |
25841
|
|
|
|
|
|
|
|
25842
|
|
|
|
|
|
|
CLzmaProb isMatch[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
25843
|
|
|
|
|
|
|
CLzmaProb isRep[kNumStates]; |
25844
|
|
|
|
|
|
|
CLzmaProb isRepG0[kNumStates]; |
25845
|
|
|
|
|
|
|
CLzmaProb isRepG1[kNumStates]; |
25846
|
|
|
|
|
|
|
CLzmaProb isRepG2[kNumStates]; |
25847
|
|
|
|
|
|
|
CLzmaProb isRep0Long[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
25848
|
|
|
|
|
|
|
|
25849
|
|
|
|
|
|
|
CLzmaProb posSlotEncoder[kNumLenToPosStates][1 << kNumPosSlotBits]; |
25850
|
|
|
|
|
|
|
CLzmaProb posEncoders[kNumFullDistances - kEndPosModelIndex]; |
25851
|
|
|
|
|
|
|
CLzmaProb posAlignEncoder[1 << kNumAlignBits]; |
25852
|
|
|
|
|
|
|
|
25853
|
|
|
|
|
|
|
CLenPriceEnc lenEnc; |
25854
|
|
|
|
|
|
|
CLenPriceEnc repLenEnc; |
25855
|
|
|
|
|
|
|
|
25856
|
|
|
|
|
|
|
unsigned lclp; |
25857
|
|
|
|
|
|
|
|
25858
|
|
|
|
|
|
|
bool fastMode; |
25859
|
|
|
|
|
|
|
|
25860
|
|
|
|
|
|
|
CRangeEnc rc; |
25861
|
|
|
|
|
|
|
|
25862
|
|
|
|
|
|
|
bool writeEndMark; |
25863
|
|
|
|
|
|
|
uint64_t nowPos64; |
25864
|
|
|
|
|
|
|
uint32_t matchPriceCount; |
25865
|
|
|
|
|
|
|
bool finished; |
25866
|
|
|
|
|
|
|
bool multiThread; |
25867
|
|
|
|
|
|
|
|
25868
|
|
|
|
|
|
|
SRes result; |
25869
|
|
|
|
|
|
|
uint32_t dictSize; |
25870
|
|
|
|
|
|
|
uint32_t matchFinderCycles; |
25871
|
|
|
|
|
|
|
|
25872
|
|
|
|
|
|
|
int needInit; |
25873
|
|
|
|
|
|
|
|
25874
|
|
|
|
|
|
|
CSaveState saveState; |
25875
|
|
|
|
|
|
|
}; |
25876
|
|
|
|
|
|
|
|
25877
|
0
|
|
|
|
|
|
void LzmaEnc_SaveState(CLzmaEncHandle pp) |
25878
|
|
|
|
|
|
|
{ |
25879
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
25880
|
|
|
|
|
|
|
CSaveState *dest = &p->saveState; |
25881
|
|
|
|
|
|
|
int i; |
25882
|
0
|
|
|
|
|
|
dest->lenEnc = p->lenEnc; |
25883
|
0
|
|
|
|
|
|
dest->repLenEnc = p->repLenEnc; |
25884
|
0
|
|
|
|
|
|
dest->state = p->state; |
25885
|
|
|
|
|
|
|
|
25886
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumStates; i++) |
25887
|
|
|
|
|
|
|
{ |
25888
|
0
|
|
|
|
|
|
memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i])); |
25889
|
0
|
|
|
|
|
|
memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i])); |
25890
|
|
|
|
|
|
|
} |
25891
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumLenToPosStates; i++) |
25892
|
0
|
|
|
|
|
|
memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i])); |
25893
|
0
|
|
|
|
|
|
memcpy(dest->isRep, p->isRep, sizeof(p->isRep)); |
25894
|
0
|
|
|
|
|
|
memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0)); |
25895
|
0
|
|
|
|
|
|
memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1)); |
25896
|
0
|
|
|
|
|
|
memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2)); |
25897
|
0
|
|
|
|
|
|
memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); |
25898
|
0
|
|
|
|
|
|
memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); |
25899
|
0
|
|
|
|
|
|
memcpy(dest->reps, p->reps, sizeof(p->reps)); |
25900
|
0
|
|
|
|
|
|
memcpy(dest->litProbs, p->litProbs, (0x300 << p->lclp) * sizeof(CLzmaProb)); |
25901
|
0
|
|
|
|
|
|
} |
25902
|
|
|
|
|
|
|
|
25903
|
0
|
|
|
|
|
|
void LzmaEnc_RestoreState(CLzmaEncHandle pp) |
25904
|
|
|
|
|
|
|
{ |
25905
|
|
|
|
|
|
|
CLzmaEnc *dest = (CLzmaEnc *)pp; |
25906
|
|
|
|
|
|
|
const CSaveState *p = &dest->saveState; |
25907
|
|
|
|
|
|
|
int i; |
25908
|
0
|
|
|
|
|
|
dest->lenEnc = p->lenEnc; |
25909
|
0
|
|
|
|
|
|
dest->repLenEnc = p->repLenEnc; |
25910
|
0
|
|
|
|
|
|
dest->state = p->state; |
25911
|
|
|
|
|
|
|
|
25912
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumStates; i++) |
25913
|
|
|
|
|
|
|
{ |
25914
|
0
|
|
|
|
|
|
memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i])); |
25915
|
0
|
|
|
|
|
|
memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i])); |
25916
|
|
|
|
|
|
|
} |
25917
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumLenToPosStates; i++) |
25918
|
0
|
|
|
|
|
|
memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i])); |
25919
|
0
|
|
|
|
|
|
memcpy(dest->isRep, p->isRep, sizeof(p->isRep)); |
25920
|
0
|
|
|
|
|
|
memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0)); |
25921
|
0
|
|
|
|
|
|
memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1)); |
25922
|
0
|
|
|
|
|
|
memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2)); |
25923
|
0
|
|
|
|
|
|
memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); |
25924
|
0
|
|
|
|
|
|
memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); |
25925
|
0
|
|
|
|
|
|
memcpy(dest->reps, p->reps, sizeof(p->reps)); |
25926
|
0
|
|
|
|
|
|
memcpy(dest->litProbs, p->litProbs, (0x300 << dest->lclp) * sizeof(CLzmaProb)); |
25927
|
0
|
|
|
|
|
|
} |
25928
|
|
|
|
|
|
|
|
25929
|
0
|
|
|
|
|
|
SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) |
25930
|
|
|
|
|
|
|
{ |
25931
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
25932
|
0
|
|
|
|
|
|
CLzmaEncProps props = *props2; |
25933
|
0
|
|
|
|
|
|
LzmaEncProps_Normalize(&props); |
25934
|
|
|
|
|
|
|
|
25935
|
0
|
0
|
|
|
|
|
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
25936
|
0
|
0
|
|
|
|
|
props.dictSize > ((uint32_t)1 << kDicLogSizeMaxCompress) || props.dictSize > ((uint32_t)1 << 30)) |
25937
|
|
|
|
|
|
|
return SZ_ERROR_PARAM; |
25938
|
0
|
|
|
|
|
|
p->dictSize = props.dictSize; |
25939
|
0
|
|
|
|
|
|
p->matchFinderCycles = props.mc; |
25940
|
|
|
|
|
|
|
{ |
25941
|
0
|
|
|
|
|
|
unsigned fb = props.fb; |
25942
|
0
|
0
|
|
|
|
|
if (fb < 5) |
25943
|
|
|
|
|
|
|
fb = 5; |
25944
|
0
|
0
|
|
|
|
|
if (fb > LZMA_MATCH_LEN_MAX) |
25945
|
|
|
|
|
|
|
fb = LZMA_MATCH_LEN_MAX; |
25946
|
0
|
|
|
|
|
|
p->numFastBytes = fb; |
25947
|
|
|
|
|
|
|
} |
25948
|
0
|
|
|
|
|
|
p->lc = props.lc; |
25949
|
0
|
|
|
|
|
|
p->lp = props.lp; |
25950
|
0
|
|
|
|
|
|
p->pb = props.pb; |
25951
|
0
|
|
|
|
|
|
p->fastMode = (props.algo == 0); |
25952
|
0
|
|
|
|
|
|
p->matchFinderBase.btMode = props.btMode; |
25953
|
|
|
|
|
|
|
{ |
25954
|
|
|
|
|
|
|
uint32_t numHashBytes = 4; |
25955
|
0
|
0
|
|
|
|
|
if (props.btMode) |
25956
|
|
|
|
|
|
|
{ |
25957
|
0
|
0
|
|
|
|
|
if (props.numHashBytes < 2) |
25958
|
|
|
|
|
|
|
numHashBytes = 2; |
25959
|
0
|
0
|
|
|
|
|
else if (props.numHashBytes < 4) |
25960
|
0
|
|
|
|
|
|
numHashBytes = props.numHashBytes; |
25961
|
|
|
|
|
|
|
} |
25962
|
0
|
|
|
|
|
|
p->matchFinderBase.numHashBytes = numHashBytes; |
25963
|
|
|
|
|
|
|
} |
25964
|
|
|
|
|
|
|
|
25965
|
0
|
|
|
|
|
|
p->matchFinderBase.cutValue = props.mc; |
25966
|
|
|
|
|
|
|
|
25967
|
0
|
|
|
|
|
|
p->writeEndMark = props.writeEndMark; |
25968
|
|
|
|
|
|
|
|
25969
|
0
|
|
|
|
|
|
return SZ_OK; |
25970
|
|
|
|
|
|
|
} |
25971
|
|
|
|
|
|
|
|
25972
|
|
|
|
|
|
|
static const int kLiteralNextStates[kNumStates] = {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5}; |
25973
|
|
|
|
|
|
|
static const int kMatchNextStates[kNumStates] = {7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10}; |
25974
|
|
|
|
|
|
|
static const int kRepNextStates[kNumStates] = {8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11}; |
25975
|
|
|
|
|
|
|
static const int kShortRepNextStates[kNumStates]= {9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11}; |
25976
|
|
|
|
|
|
|
|
25977
|
|
|
|
|
|
|
#define IsCharState(s) ((s) < 7) |
25978
|
|
|
|
|
|
|
|
25979
|
|
|
|
|
|
|
#define GetLenToPosState(len) (((len) < kNumLenToPosStates + 1) ? (len) - 2 : kNumLenToPosStates - 1) |
25980
|
|
|
|
|
|
|
|
25981
|
|
|
|
|
|
|
#define kInfinityPrice (1 << 30) |
25982
|
|
|
|
|
|
|
|
25983
|
|
|
|
|
|
|
static void RangeEnc_Construct(CRangeEnc *p) |
25984
|
|
|
|
|
|
|
{ |
25985
|
0
|
|
|
|
|
|
p->outStream = 0; |
25986
|
0
|
|
|
|
|
|
p->bufBase = 0; |
25987
|
|
|
|
|
|
|
} |
25988
|
|
|
|
|
|
|
|
25989
|
|
|
|
|
|
|
#define RangeEnc_GetProcessed(p) ((p)->processed + ((p)->buf - (p)->bufBase) + (p)->cacheSize) |
25990
|
|
|
|
|
|
|
|
25991
|
|
|
|
|
|
|
#define RC_BUF_SIZE (1 << 16) |
25992
|
|
|
|
|
|
|
static int RangeEnc_Alloc(CRangeEnc *p, ISzAlloc *alloc) |
25993
|
|
|
|
|
|
|
{ |
25994
|
0
|
0
|
|
|
|
|
if (p->bufBase == 0) |
25995
|
|
|
|
|
|
|
{ |
25996
|
0
|
|
|
|
|
|
p->bufBase = (uint8_t *)alloc->Alloc(alloc, RC_BUF_SIZE); |
25997
|
0
|
0
|
|
|
|
|
if (p->bufBase == 0) |
25998
|
|
|
|
|
|
|
return 0; |
25999
|
0
|
|
|
|
|
|
p->bufLim = p->bufBase + RC_BUF_SIZE; |
26000
|
|
|
|
|
|
|
} |
26001
|
|
|
|
|
|
|
return 1; |
26002
|
|
|
|
|
|
|
} |
26003
|
|
|
|
|
|
|
|
26004
|
|
|
|
|
|
|
static void RangeEnc_Free(CRangeEnc *p, ISzAlloc *alloc) |
26005
|
|
|
|
|
|
|
{ |
26006
|
0
|
|
|
|
|
|
alloc->Free(alloc, p->bufBase); |
26007
|
0
|
|
|
|
|
|
p->bufBase = 0; |
26008
|
|
|
|
|
|
|
} |
26009
|
|
|
|
|
|
|
|
26010
|
|
|
|
|
|
|
static void RangeEnc_Init(CRangeEnc *p) |
26011
|
|
|
|
|
|
|
{ |
26012
|
|
|
|
|
|
|
/* Stream.Init(); */ |
26013
|
0
|
|
|
|
|
|
p->low = 0; |
26014
|
0
|
|
|
|
|
|
p->range = 0xFFFFFFFF; |
26015
|
0
|
|
|
|
|
|
p->cacheSize = 1; |
26016
|
0
|
|
|
|
|
|
p->cache = 0; |
26017
|
|
|
|
|
|
|
|
26018
|
0
|
|
|
|
|
|
p->buf = p->bufBase; |
26019
|
|
|
|
|
|
|
|
26020
|
0
|
|
|
|
|
|
p->processed = 0; |
26021
|
0
|
|
|
|
|
|
p->res = SZ_OK; |
26022
|
|
|
|
|
|
|
} |
26023
|
|
|
|
|
|
|
|
26024
|
0
|
|
|
|
|
|
static void RangeEnc_FlushStream(CRangeEnc *p) |
26025
|
|
|
|
|
|
|
{ |
26026
|
|
|
|
|
|
|
size_t num; |
26027
|
0
|
0
|
|
|
|
|
if (p->res != SZ_OK) |
26028
|
|
|
|
|
|
|
return; |
26029
|
0
|
|
|
|
|
|
num = p->buf - p->bufBase; |
26030
|
0
|
0
|
|
|
|
|
if (num != p->outStream->Write(p->outStream, p->bufBase, num)) |
26031
|
0
|
|
|
|
|
|
p->res = SZ_ERROR_WRITE; |
26032
|
0
|
|
|
|
|
|
p->processed += num; |
26033
|
0
|
|
|
|
|
|
p->buf = p->bufBase; |
26034
|
|
|
|
|
|
|
} |
26035
|
|
|
|
|
|
|
|
26036
|
0
|
|
|
|
|
|
static void RangeEnc_ShiftLow(CRangeEnc *p) |
26037
|
|
|
|
|
|
|
{ |
26038
|
0
|
0
|
|
|
|
|
if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0) |
|
|
0
|
|
|
|
|
|
26039
|
|
|
|
|
|
|
{ |
26040
|
0
|
|
|
|
|
|
uint8_t temp = p->cache; |
26041
|
0
|
0
|
|
|
|
|
do |
26042
|
|
|
|
|
|
|
{ |
26043
|
0
|
|
|
|
|
|
uint8_t *buf = p->buf; |
26044
|
0
|
|
|
|
|
|
*buf++ = (uint8_t)(temp + (uint8_t)(p->low >> 32)); |
26045
|
0
|
|
|
|
|
|
p->buf = buf; |
26046
|
0
|
0
|
|
|
|
|
if (buf == p->bufLim) |
26047
|
0
|
|
|
|
|
|
RangeEnc_FlushStream(p); |
26048
|
|
|
|
|
|
|
temp = 0xFF; |
26049
|
|
|
|
|
|
|
} |
26050
|
0
|
|
|
|
|
|
while (--p->cacheSize != 0); |
26051
|
0
|
|
|
|
|
|
p->cache = (uint8_t)((uint32_t)p->low >> 24); |
26052
|
|
|
|
|
|
|
} |
26053
|
0
|
|
|
|
|
|
p->cacheSize++; |
26054
|
0
|
|
|
|
|
|
p->low = (uint32_t)p->low << 8; |
26055
|
0
|
|
|
|
|
|
} |
26056
|
|
|
|
|
|
|
|
26057
|
|
|
|
|
|
|
// Pushes the remaining coder state out by shifting low five times.
static void RangeEnc_FlushData(CRangeEnc *p)
{
  for (int remaining = 5; remaining > 0; --remaining)
    RangeEnc_ShiftLow(p);
}
26063
|
|
|
|
|
|
|
|
26064
|
0
|
|
|
|
|
|
// Encodes the lowest numBits bits of value, most significant first, without
// a probability model: each bit halves the range.
//
// The loop body runs at least once, so callers are expected to pass
// numBits > 0.
static void RangeEnc_EncodeDirectBits(CRangeEnc *p, uint32_t value, int numBits)
{
  do
  {
    --numBits;
    p->range >>= 1;
    {
      // All ones when the current bit is set, zero otherwise.
      const uint32_t mask = 0 - ((value >> numBits) & 1);
      p->low += p->range & mask;
    }
    if (p->range < kTopValue)
    {
      // Renormalize.
      p->range <<= 8;
      RangeEnc_ShiftLow(p);
    }
  }
  while (numBits != 0);
}
26078
|
|
|
|
|
|
|
|
26079
|
0
|
|
|
|
|
|
// Encodes one bit with the adaptive probability *prob and updates the
// probability towards the value just coded.
//
// symbol == 0 selects the lower part of the range, any other value the upper
// part. The range is renormalized when it drops below kTopValue.
static void RangeEnc_EncodeBit(CRangeEnc *p, CLzmaProb *prob, uint32_t symbol)
{
  uint32_t probValue = *prob;
  const uint32_t bound = (p->range >> kNumBitModelTotalBits) * probValue;

  if (symbol != 0)
  {
    p->low += bound;
    p->range -= bound;
    probValue -= probValue >> kNumMoveBits;
  }
  else
  {
    p->range = bound;
    probValue += (kBitModelTotal - probValue) >> kNumMoveBits;
  }
  *prob = (CLzmaProb)probValue;

  if (p->range < kTopValue)
  {
    p->range <<= 8;
    RangeEnc_ShiftLow(p);
  }
}
26101
|
|
|
|
|
|
|
|
26102
|
0
|
|
|
|
|
|
// Encodes a literal bit by bit, most significant bit first.
//
// A marker bit (0x100) is OR-ed into symbol; the bits above bit 7 of the
// running value select the probability slot, and the loop ends once the
// marker has been shifted up to bit 16.
static void LitEnc_Encode(CRangeEnc *p, CLzmaProb *probs, uint32_t symbol)
{
  uint32_t cursor = symbol | 0x100;
  for (;;)
  {
    const uint32_t bit = (cursor >> 7) & 1;
    RangeEnc_EncodeBit(p, probs + (cursor >> 8), bit);
    cursor <<= 1;
    if (cursor >= 0x10000)
      break;
  }
}
26112
|
|
|
|
|
|
|
|
26113
|
0
|
|
|
|
|
|
// Encodes a literal bit by bit using matchByte as additional context.
//
// While the bits coded so far agree with the corresponding bits of
// matchByte, offs stays 0x100 and the match bit contributes to the slot
// index; after the first disagreement offs becomes 0.
static void LitEnc_EncodeMatched(CRangeEnc *p, CLzmaProb *probs, uint32_t symbol, uint32_t matchByte)
{
  uint32_t offs = 0x100;
  symbol |= 0x100;
  for (;;)
  {
    matchByte <<= 1;
    const uint32_t slot = offs + (matchByte & offs) + (symbol >> 8);
    const uint32_t bit = (symbol >> 7) & 1;
    RangeEnc_EncodeBit(p, probs + slot, bit);
    symbol <<= 1;
    offs &= ~(matchByte ^ symbol);
    if (symbol >= 0x10000)
      break;
  }
}
26126
|
|
|
|
|
|
|
|
26127
|
0
|
|
|
|
|
|
void LzmaEnc_InitPriceTables(uint32_t *ProbPrices) |
26128
|
|
|
|
|
|
|
{ |
26129
|
|
|
|
|
|
|
uint32_t i; |
26130
|
0
|
0
|
|
|
|
|
for (i = (1 << kNumMoveReducingBits) / 2; i < kBitModelTotal; i += (1 << kNumMoveReducingBits)) |
26131
|
|
|
|
|
|
|
{ |
26132
|
|
|
|
|
|
|
const int kCyclesBits = kNumBitPriceShiftBits; |
26133
|
|
|
|
|
|
|
uint32_t w = i; |
26134
|
|
|
|
|
|
|
uint32_t bitCount = 0; |
26135
|
|
|
|
|
|
|
int j; |
26136
|
0
|
0
|
|
|
|
|
for (j = 0; j < kCyclesBits; j++) |
26137
|
|
|
|
|
|
|
{ |
26138
|
0
|
|
|
|
|
|
w = w * w; |
26139
|
0
|
|
|
|
|
|
bitCount <<= 1; |
26140
|
0
|
0
|
|
|
|
|
while (w >= ((uint32_t)1 << 16)) |
26141
|
|
|
|
|
|
|
{ |
26142
|
0
|
|
|
|
|
|
w >>= 1; |
26143
|
0
|
|
|
|
|
|
bitCount++; |
26144
|
|
|
|
|
|
|
} |
26145
|
|
|
|
|
|
|
} |
26146
|
0
|
|
|
|
|
|
ProbPrices[i >> kNumMoveReducingBits] = ((kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); |
26147
|
|
|
|
|
|
|
} |
26148
|
0
|
|
|
|
|
|
} |
26149
|
|
|
|
|
|
|
|
26150
|
|
|
|
|
|
|
// Price lookups in the ProbPrices table.
//
// The variants without the 'a' suffix read the table through an encoder
// pointer named p; the 'a' variants read a table named ProbPrices that must
// be in scope at the point of use.
//
// GET_PRICE / GET_PRICEa look up the price of coding `bit` with probability
// `probValue`: for a non-zero bit the probability is XOR-ed with
// (kBitModelTotal - 1) before indexing. NOTE: these two expand to a complete
// statement (they end in ';'), so they can only be used as the last thing in
// a statement.
#define GET_PRICE(probValue, bit) \
  p->ProbPrices[((probValue) ^ ((-((int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits];

#define GET_PRICEa(probValue, bit) \
  ProbPrices[((probValue) ^ ((-((int)(bit))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits];

// Fixed-bit variants: price of coding a 0 bit / a 1 bit.
#define GET_PRICE_0(probValue) p->ProbPrices[(probValue) >> kNumMoveReducingBits]
#define GET_PRICE_1(probValue) p->ProbPrices[((probValue) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits]

#define GET_PRICE_0a(probValue) ProbPrices[(probValue) >> kNumMoveReducingBits]
#define GET_PRICE_1a(probValue) ProbPrices[((probValue) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits]
26161
|
|
|
|
|
|
|
|
26162
|
|
|
|
|
|
|
static uint32_t LitEnc_GetPrice(const CLzmaProb *probs, uint32_t symbol, uint32_t *ProbPrices) |
26163
|
|
|
|
|
|
|
{ |
26164
|
|
|
|
|
|
|
uint32_t price = 0; |
26165
|
0
|
|
|
|
|
|
symbol |= 0x100; |
26166
|
|
|
|
|
|
|
do |
26167
|
|
|
|
|
|
|
{ |
26168
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[symbol >> 8], (symbol >> 7) & 1); |
26169
|
0
|
|
|
|
|
|
symbol <<= 1; |
26170
|
|
|
|
|
|
|
} |
26171
|
0
|
0
|
|
|
|
|
while (symbol < 0x10000); |
|
|
0
|
|
|
|
|
|
26172
|
|
|
|
|
|
|
return price; |
26173
|
|
|
|
|
|
|
} |
26174
|
|
|
|
|
|
|
|
26175
|
0
|
|
|
|
|
|
static uint32_t LitEnc_GetPriceMatched(const CLzmaProb *probs, uint32_t symbol, uint32_t matchByte, uint32_t *ProbPrices) |
26176
|
|
|
|
|
|
|
{ |
26177
|
|
|
|
|
|
|
uint32_t price = 0; |
26178
|
|
|
|
|
|
|
uint32_t offs = 0x100; |
26179
|
0
|
|
|
|
|
|
symbol |= 0x100; |
26180
|
|
|
|
|
|
|
do |
26181
|
|
|
|
|
|
|
{ |
26182
|
0
|
|
|
|
|
|
matchByte <<= 1; |
26183
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[offs + (matchByte & offs) + (symbol >> 8)], (symbol >> 7) & 1); |
26184
|
0
|
|
|
|
|
|
symbol <<= 1; |
26185
|
0
|
|
|
|
|
|
offs &= ~(matchByte ^ symbol); |
26186
|
|
|
|
|
|
|
} |
26187
|
0
|
0
|
|
|
|
|
while (symbol < 0x10000); |
26188
|
0
|
|
|
|
|
|
return price; |
26189
|
|
|
|
|
|
|
} |
26190
|
|
|
|
|
|
|
|
26191
|
0
|
|
|
|
|
|
// Encodes the lowest numBitLevels bits of symbol through a binary tree of
// probabilities, most significant bit first. The tree node index starts at 1
// and is extended by every coded bit.
static void RcTree_Encode(CRangeEnc *rc, CLzmaProb *probs, int numBitLevels, uint32_t symbol)
{
  uint32_t node = 1;
  int level = numBitLevels;
  while (level != 0)
  {
    --level;
    const uint32_t bit = (symbol >> level) & 1;
    RangeEnc_EncodeBit(rc, probs + node, bit);
    node = (node << 1) | bit;
  }
}
26204
|
|
|
|
|
|
|
|
26205
|
0
|
|
|
|
|
|
// Encodes the lowest numBitLevels bits of symbol through a binary tree of
// probabilities, least significant bit first.
static void RcTree_ReverseEncode(CRangeEnc *rc, CLzmaProb *probs, int numBitLevels, uint32_t symbol)
{
  uint32_t node = 1;
  for (int remaining = numBitLevels; remaining > 0; --remaining)
  {
    const uint32_t bit = symbol & 1;
    symbol >>= 1;
    RangeEnc_EncodeBit(rc, probs + node, bit);
    node = (node << 1) | bit;
  }
}
26217
|
|
|
|
|
|
|
|
26218
|
|
|
|
|
|
|
static uint32_t RcTree_GetPrice(const CLzmaProb *probs, int numBitLevels, uint32_t symbol, uint32_t *ProbPrices) |
26219
|
|
|
|
|
|
|
{ |
26220
|
|
|
|
|
|
|
uint32_t price = 0; |
26221
|
0
|
|
|
|
|
|
symbol |= (1 << numBitLevels); |
26222
|
0
|
0
|
|
|
|
|
while (symbol != 1) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
26223
|
|
|
|
|
|
|
{ |
26224
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[symbol >> 1], symbol & 1); |
26225
|
|
|
|
|
|
|
symbol >>= 1; |
26226
|
|
|
|
|
|
|
} |
26227
|
|
|
|
|
|
|
return price; |
26228
|
|
|
|
|
|
|
} |
26229
|
|
|
|
|
|
|
|
26230
|
|
|
|
|
|
|
static uint32_t RcTree_ReverseGetPrice(const CLzmaProb *probs, int numBitLevels, uint32_t symbol, uint32_t *ProbPrices) |
26231
|
|
|
|
|
|
|
{ |
26232
|
|
|
|
|
|
|
uint32_t price = 0; |
26233
|
|
|
|
|
|
|
uint32_t m = 1; |
26234
|
|
|
|
|
|
|
int i; |
26235
|
0
|
0
|
|
|
|
|
for (i = numBitLevels; i != 0; i--) |
|
|
0
|
|
|
|
|
|
26236
|
|
|
|
|
|
|
{ |
26237
|
0
|
|
|
|
|
|
uint32_t bit = symbol & 1; |
26238
|
0
|
|
|
|
|
|
symbol >>= 1; |
26239
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[m], bit); |
26240
|
0
|
|
|
|
|
|
m = (m << 1) | bit; |
26241
|
|
|
|
|
|
|
} |
26242
|
|
|
|
|
|
|
return price; |
26243
|
|
|
|
|
|
|
} |
26244
|
|
|
|
|
|
|
|
26245
|
|
|
|
|
|
|
// Sets every probability of the length encoder (both choice bits and the
// low, mid and high tables) to kProbInitValue.
static void LenEnc_Init(CLenEnc *p)
{
  p->choice2 = kProbInitValue;
  p->choice = p->choice2;

  for (unsigned i = (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i-- > 0;)
    p->low[i] = kProbInitValue;
  for (unsigned i = (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i-- > 0;)
    p->mid[i] = kProbInitValue;
  for (unsigned i = kLenNumHighSymbols; i-- > 0;)
    p->high[i] = kProbInitValue;
}
26256
|
|
|
|
|
|
|
|
26257
|
0
|
|
|
|
|
|
// Encodes a length symbol.
//
// Symbols below kLenNumLowSymbols are coded as choice=0 plus the low tree of
// the given posState; the next kLenNumMidSymbols symbols as choice=1,
// choice2=0 plus the mid tree of the given posState; everything else as
// choice=1, choice2=1 plus the shared high tree.
static void LenEnc_Encode(CLenEnc *p, CRangeEnc *rc, uint32_t symbol, uint32_t posState)
{
  if (symbol < kLenNumLowSymbols)
  {
    RangeEnc_EncodeBit(rc, &p->choice, 0);
    RcTree_Encode(rc, p->low + (posState << kLenNumLowBits), kLenNumLowBits, symbol);
    return;
  }

  RangeEnc_EncodeBit(rc, &p->choice, 1);
  if (symbol < kLenNumLowSymbols + kLenNumMidSymbols)
  {
    RangeEnc_EncodeBit(rc, &p->choice2, 0);
    RcTree_Encode(rc, p->mid + (posState << kLenNumMidBits), kLenNumMidBits, symbol - kLenNumLowSymbols);
    return;
  }

  RangeEnc_EncodeBit(rc, &p->choice2, 1);
  RcTree_Encode(rc, p->high, kLenNumHighBits, symbol - kLenNumLowSymbols - kLenNumMidSymbols);
}
26279
|
|
|
|
|
|
|
|
26280
|
0
|
|
|
|
|
|
// Fills prices[0 .. numSymbols-1] with the price of coding each length
// symbol for the given posState.
//
// The price of a symbol is the price of its choice bit(s) plus the price of
// its position in the low, mid or high tree, following the same split as
// LenEnc_Encode.
static void LenEnc_SetPrices(CLenEnc *p, uint32_t posState, uint32_t numSymbols, uint32_t *prices, uint32_t *ProbPrices)
{
  const uint32_t lowPrefix = GET_PRICE_0a(p->choice);
  const uint32_t notLow = GET_PRICE_1a(p->choice);
  const uint32_t midPrefix = notLow + GET_PRICE_0a(p->choice2);
  const uint32_t highPrefix = notLow + GET_PRICE_1a(p->choice2);

  for (uint32_t i = 0; i < numSymbols; i++)
  {
    if (i < kLenNumLowSymbols)
      prices[i] = lowPrefix + RcTree_GetPrice(p->low + (posState << kLenNumLowBits), kLenNumLowBits, i, ProbPrices);
    else if (i < kLenNumLowSymbols + kLenNumMidSymbols)
      prices[i] = midPrefix + RcTree_GetPrice(p->mid + (posState << kLenNumMidBits), kLenNumMidBits, i - kLenNumLowSymbols, ProbPrices);
    else
      prices[i] = highPrefix + RcTree_GetPrice(p->high, kLenNumHighBits, i - kLenNumLowSymbols - kLenNumMidSymbols, ProbPrices);
  }
}
26302
|
|
|
|
|
|
|
|
26303
|
|
|
|
|
|
|
// Recomputes the cached length prices of one posState and resets its
// countdown to the table size.
static void LenPriceEnc_UpdateTable(CLenPriceEnc *p, uint32_t posState, uint32_t *ProbPrices)
{
  const auto tableSize = p->tableSize;
  LenEnc_SetPrices(&p->p, posState, tableSize, p->prices[posState], ProbPrices);
  p->counters[posState] = tableSize;
}
26308
|
|
|
|
|
|
|
|
26309
|
|
|
|
|
|
|
// Recomputes the cached length prices for posStates 0 .. numPosStates-1.
static void LenPriceEnc_UpdateTables(CLenPriceEnc *p, uint32_t numPosStates, uint32_t *ProbPrices)
{
  uint32_t posState = 0;
  while (posState < numPosStates)
  {
    LenPriceEnc_UpdateTable(p, posState, ProbPrices);
    ++posState;
  }
}
26315
|
|
|
|
|
|
|
|
26316
|
0
|
|
|
|
|
|
// Encodes a length symbol and, when updatePrice is set, decrements the
// countdown of the posState; the cached prices of that posState are rebuilt
// when the countdown reaches zero.
static void LenEnc_Encode2(CLenPriceEnc *p, CRangeEnc *rc, uint32_t symbol, uint32_t posState, bool updatePrice, uint32_t *ProbPrices)
{
  LenEnc_Encode(&p->p, rc, symbol, posState);
  if (!updatePrice)
    return;

  auto &remaining = p->counters[posState];
  --remaining;
  if (remaining == 0)
    LenPriceEnc_UpdateTable(p, posState, ProbPrices);
}
26323
|
|
|
|
|
|
|
|
26324
|
|
|
|
|
|
|
// Skips num positions in the match finder and adds num to the additional
// offset. Does nothing for num == 0.
static void MovePos(CLzmaEnc *p, uint32_t num)
{
  if (num == 0)
    return;
  p->additionalOffset += num;
  p->matchFinder.Skip(p->matchFinderObj, num);
}
26332
|
|
|
|
|
|
|
|
26333
|
0
|
|
|
|
|
|
// Queries the match finder at the current position.
//
// Stores the number of available bytes in p->numAvail, the matches in
// p->matches and the number of values written there in *numDistancePairsRes.
// Returns the length stored in the last pair (0 when there are no matches).
// When that length equals numFastBytes, the match is extended by comparing
// bytes directly, up to min(numAvail, LZMA_MATCH_LEN_MAX).
// The additional offset is incremented by one.
static uint32_t ReadMatchDistances(CLzmaEnc *p, uint32_t *numDistancePairsRes)
{
  p->numAvail = p->matchFinder.GetNumAvailableBytes(p->matchFinderObj);
  const uint32_t numPairs = p->matchFinder.GetMatches(p->matchFinderObj, p->matches);

  uint32_t lenRes = 0;
  if (numPairs > 0)
  {
    lenRes = p->matches[numPairs - 2];
    if (lenRes == p->numFastBytes)
    {
      const uint8_t *current = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
      const uint32_t distance = p->matches[numPairs - 1] + 1;
      const uint8_t *candidate = current - distance;

      uint32_t limit = p->numAvail;
      if (limit > LZMA_MATCH_LEN_MAX)
        limit = LZMA_MATCH_LEN_MAX;

      while (lenRes < limit && current[lenRes] == candidate[lenRes])
        lenRes++;
    }
  }

  p->additionalOffset++;
  *numDistancePairsRes = numPairs;
  return lenRes;
}
26358
|
|
|
|
|
|
|
|
26359
|
|
|
|
|
|
|
// Helpers for tagging an optimum-table entry (anything with backPrev and
// prev1IsChar members).
//
// MakeAsChar sets backPrev to (uint32_t)(-1), MakeAsShortRep sets it to 0;
// both clear prev1IsChar. IsShortRep tests for backPrev == 0.
//
// The two assignments are joined with the comma operator so that each macro
// expands to ONE statement: used as the body of an unbraced `if`, both
// assignments stay under the condition. The expansion keeps its trailing ';'
// because existing call sites are written both with and without their own
// semicolon.
#define MakeAsChar(p) (p)->backPrev = (uint32_t)(-1), (p)->prev1IsChar = false;
#define MakeAsShortRep(p) (p)->backPrev = 0, (p)->prev1IsChar = false;
#define IsShortRep(p) ((p)->backPrev == 0)
26362
|
|
|
|
|
|
|
|
26363
|
|
|
|
|
|
|
// Returns the price of coding a 0 bit with isRepG0[state] plus the price of
// coding a 0 bit with isRep0Long[state][posState].
static uint32_t GetRepLen1Price(CLzmaEnc *p, uint32_t state, uint32_t posState)
{
  const uint32_t repG0Price = GET_PRICE_0(p->isRepG0[state]);
  const uint32_t rep0LongPrice = GET_PRICE_0(p->isRep0Long[state][posState]);
  return repG0Price + rep0LongPrice;
}
26369
|
|
|
|
|
|
|
|
26370
|
0
|
|
|
|
|
|
// Returns the price of the bits that select which repeated distance is used
// (the length is not included):
//   repIndex 0: isRepG0=0, isRep0Long=1
//   repIndex 1: isRepG0=1, isRepG1=0
//   otherwise:  isRepG0=1, isRepG1=1, isRepG2=(repIndex - 2)
static uint32_t GetPureRepPrice(CLzmaEnc *p, uint32_t repIndex, uint32_t state, uint32_t posState)
{
  if (repIndex == 0)
    return GET_PRICE_0(p->isRepG0[state]) + GET_PRICE_1(p->isRep0Long[state][posState]);

  uint32_t price = GET_PRICE_1(p->isRepG0[state]);
  if (repIndex == 1)
    return price + GET_PRICE_0(p->isRepG1[state]);

  price += GET_PRICE_1(p->isRepG1[state]);
  price += GET_PRICE(p->isRepG2[state], repIndex - 2);
  return price;
}
26391
|
|
|
|
|
|
|
|
26392
|
|
|
|
|
|
|
// Returns the full price of a repeated match: the cached rep-length price
// for len at posState plus the price of selecting repIndex.
static uint32_t GetRepPrice(CLzmaEnc *p, uint32_t repIndex, uint32_t len, uint32_t state, uint32_t posState)
{
  const uint32_t lenPrice = p->repLenEnc.prices[posState][len - LZMA_MATCH_LEN_MIN];
  const uint32_t selectPrice = GetPureRepPrice(p, repIndex, state, posState);
  return lenPrice + selectPrice;
}
26397
|
|
|
|
|
|
|
|
26398
|
|
|
|
|
|
|
// Walks the optimum table backwards from entry cur to entry 0 and reverses
// the links on the way: afterwards posPrev/backPrev of every entry on the
// path describe the NEXT step instead of the previous one.
//
// Entries marked prev1IsChar are first expanded into an explicit literal
// step (and, with prev2 set, the step recorded in posPrev2/backPrev2).
//
// Stores cur in optimumEndIndex, the first step's backPrev in *backRes and
// the first step's end position in optimumCurrentIndex, which is returned.
static uint32_t Backward(CLzmaEnc *p, uint32_t *backRes, uint32_t cur)
{
  uint32_t posMem = p->opt[cur].posPrev;
  uint32_t backMem = p->opt[cur].backPrev;
  p->optimumEndIndex = cur;

  do
  {
    COptimal &here = p->opt[cur];
    if (here.prev1IsChar)
    {
      // Turn the predecessor into a literal step coming from posMem - 1.
      COptimal &before = p->opt[posMem];
      before.backPrev = (uint32_t)(-1);
      before.prev1IsChar = false;
      before.posPrev = posMem - 1;
      if (here.prev2)
      {
        COptimal &beforeLiteral = p->opt[posMem - 1];
        beforeLiteral.prev1IsChar = false;
        beforeLiteral.posPrev = here.posPrev2;
        beforeLiteral.backPrev = here.backPrev2;
      }
    }

    // Reverse the link: remember where the predecessor came from, then make
    // it point forward to the entry just visited.
    const uint32_t posPrev = posMem;
    const uint32_t backCur = backMem;
    COptimal &prev = p->opt[posPrev];

    backMem = prev.backPrev;
    posMem = prev.posPrev;

    prev.backPrev = backCur;
    prev.posPrev = cur;
    cur = posPrev;
  }
  while (cur != 0);

  *backRes = p->opt[0].backPrev;
  p->optimumCurrentIndex = p->opt[0].posPrev;
  return p->optimumCurrentIndex;
}
26433
|
|
|
|
|
|
|
|
26434
|
|
|
|
|
|
|
// Address of the literal probability table for a position and the preceding
// byte. Reads litProbs, lpMask and lc through an encoder pointer named p that
// must be in scope; each context owns 0x300 probabilities.
#define LIT_PROBS(pos, prevByte) \
  (p->litProbs + ((((pos) & p->lpMask) << p->lc) + ((prevByte) >> (8 - p->lc))) * 0x300)
26435
|
|
|
|
|
|
|
|
26436
|
0
|
|
|
|
|
|
static uint32_t GetOptimum(CLzmaEnc *p, uint32_t position, uint32_t *backRes) |
26437
|
|
|
|
|
|
|
{ |
26438
|
|
|
|
|
|
|
uint32_t numAvail, mainLen, numPairs, repMaxIndex, i, posState, lenEnd, len, cur; |
26439
|
|
|
|
|
|
|
uint32_t matchPrice, repMatchPrice, normalMatchPrice; |
26440
|
|
|
|
|
|
|
uint32_t reps[LZMA_NUM_REPS], repLens[LZMA_NUM_REPS]; |
26441
|
|
|
|
|
|
|
uint32_t *matches; |
26442
|
|
|
|
|
|
|
const uint8_t *data; |
26443
|
|
|
|
|
|
|
uint8_t curByte, matchByte; |
26444
|
0
|
0
|
|
|
|
|
if (p->optimumEndIndex != p->optimumCurrentIndex) |
26445
|
|
|
|
|
|
|
{ |
26446
|
|
|
|
|
|
|
const COptimal *opt = &p->opt[p->optimumCurrentIndex]; |
26447
|
0
|
|
|
|
|
|
uint32_t lenRes = opt->posPrev - p->optimumCurrentIndex; |
26448
|
0
|
|
|
|
|
|
*backRes = opt->backPrev; |
26449
|
0
|
|
|
|
|
|
p->optimumCurrentIndex = opt->posPrev; |
26450
|
0
|
|
|
|
|
|
return lenRes; |
26451
|
|
|
|
|
|
|
} |
26452
|
0
|
|
|
|
|
|
p->optimumCurrentIndex = p->optimumEndIndex = 0; |
26453
|
|
|
|
|
|
|
|
26454
|
0
|
0
|
|
|
|
|
if (p->additionalOffset == 0) |
26455
|
0
|
|
|
|
|
|
mainLen = ReadMatchDistances(p, &numPairs); |
26456
|
|
|
|
|
|
|
else |
26457
|
|
|
|
|
|
|
{ |
26458
|
0
|
|
|
|
|
|
mainLen = p->longestMatchLength; |
26459
|
0
|
|
|
|
|
|
numPairs = p->numPairs; |
26460
|
|
|
|
|
|
|
} |
26461
|
|
|
|
|
|
|
|
26462
|
0
|
|
|
|
|
|
numAvail = p->numAvail; |
26463
|
0
|
0
|
|
|
|
|
if (numAvail < 2) |
26464
|
|
|
|
|
|
|
{ |
26465
|
0
|
|
|
|
|
|
*backRes = (uint32_t)(-1); |
26466
|
0
|
|
|
|
|
|
return 1; |
26467
|
|
|
|
|
|
|
} |
26468
|
0
|
0
|
|
|
|
|
if (numAvail > LZMA_MATCH_LEN_MAX) |
26469
|
|
|
|
|
|
|
numAvail = LZMA_MATCH_LEN_MAX; |
26470
|
|
|
|
|
|
|
|
26471
|
0
|
|
|
|
|
|
data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1; |
26472
|
|
|
|
|
|
|
repMaxIndex = 0; |
26473
|
0
|
0
|
|
|
|
|
for (i = 0; i < LZMA_NUM_REPS; i++) |
26474
|
|
|
|
|
|
|
{ |
26475
|
|
|
|
|
|
|
uint32_t lenTest; |
26476
|
|
|
|
|
|
|
const uint8_t *data2; |
26477
|
0
|
|
|
|
|
|
reps[i] = p->reps[i]; |
26478
|
0
|
|
|
|
|
|
data2 = data - (reps[i] + 1); |
26479
|
0
|
0
|
|
|
|
|
if (data[0] != data2[0] || data[1] != data2[1]) |
|
|
0
|
|
|
|
|
|
26480
|
|
|
|
|
|
|
{ |
26481
|
0
|
|
|
|
|
|
repLens[i] = 0; |
26482
|
0
|
|
|
|
|
|
continue; |
26483
|
|
|
|
|
|
|
} |
26484
|
0
|
0
|
|
|
|
|
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
|
0
|
|
|
|
|
|
26485
|
0
|
|
|
|
|
|
repLens[i] = lenTest; |
26486
|
0
|
0
|
|
|
|
|
if (lenTest > repLens[repMaxIndex]) |
26487
|
|
|
|
|
|
|
repMaxIndex = i; |
26488
|
|
|
|
|
|
|
} |
26489
|
0
|
0
|
|
|
|
|
if (repLens[repMaxIndex] >= p->numFastBytes) |
26490
|
|
|
|
|
|
|
{ |
26491
|
|
|
|
|
|
|
uint32_t lenRes; |
26492
|
0
|
|
|
|
|
|
*backRes = repMaxIndex; |
26493
|
|
|
|
|
|
|
lenRes = repLens[repMaxIndex]; |
26494
|
0
|
|
|
|
|
|
MovePos(p, lenRes - 1); |
26495
|
|
|
|
|
|
|
return lenRes; |
26496
|
|
|
|
|
|
|
} |
26497
|
|
|
|
|
|
|
|
26498
|
0
|
|
|
|
|
|
matches = p->matches; |
26499
|
0
|
0
|
|
|
|
|
if (mainLen >= p->numFastBytes) |
26500
|
|
|
|
|
|
|
{ |
26501
|
0
|
|
|
|
|
|
*backRes = matches[numPairs - 1] + LZMA_NUM_REPS; |
26502
|
0
|
|
|
|
|
|
MovePos(p, mainLen - 1); |
26503
|
|
|
|
|
|
|
return mainLen; |
26504
|
|
|
|
|
|
|
} |
26505
|
0
|
|
|
|
|
|
curByte = *data; |
26506
|
0
|
|
|
|
|
|
matchByte = *(data - (reps[0] + 1)); |
26507
|
|
|
|
|
|
|
|
26508
|
0
|
0
|
|
|
|
|
if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2) |
|
|
0
|
|
|
|
|
|
26509
|
|
|
|
|
|
|
{ |
26510
|
0
|
|
|
|
|
|
*backRes = (uint32_t)-1; |
26511
|
0
|
|
|
|
|
|
return 1; |
26512
|
|
|
|
|
|
|
} |
26513
|
|
|
|
|
|
|
|
26514
|
0
|
|
|
|
|
|
p->opt[0].state = (CState)p->state; |
26515
|
|
|
|
|
|
|
|
26516
|
0
|
|
|
|
|
|
posState = (position & p->pbMask); |
26517
|
|
|
|
|
|
|
|
26518
|
|
|
|
|
|
|
{ |
26519
|
0
|
|
|
|
|
|
const CLzmaProb *probs = LIT_PROBS(position, *(data - 1)); |
26520
|
0
|
0
|
|
|
|
|
p->opt[1].price = GET_PRICE_0(p->isMatch[p->state][posState]) + |
26521
|
|
|
|
|
|
|
(!IsCharState(p->state) ? |
26522
|
0
|
|
|
|
|
|
LitEnc_GetPriceMatched(probs, curByte, matchByte, p->ProbPrices) : |
26523
|
0
|
|
|
|
|
|
LitEnc_GetPrice(probs, curByte, p->ProbPrices)); |
26524
|
|
|
|
|
|
|
} |
26525
|
|
|
|
|
|
|
|
26526
|
0
|
|
|
|
|
|
MakeAsChar(&p->opt[1]); |
26527
|
|
|
|
|
|
|
|
26528
|
0
|
|
|
|
|
|
matchPrice = GET_PRICE_1(p->isMatch[p->state][posState]); |
26529
|
0
|
|
|
|
|
|
repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[p->state]); |
26530
|
|
|
|
|
|
|
|
26531
|
0
|
0
|
|
|
|
|
if (matchByte == curByte) |
26532
|
|
|
|
|
|
|
{ |
26533
|
0
|
|
|
|
|
|
uint32_t shortRepPrice = repMatchPrice + GetRepLen1Price(p, p->state, posState); |
26534
|
0
|
0
|
|
|
|
|
if (shortRepPrice < p->opt[1].price) |
26535
|
|
|
|
|
|
|
{ |
26536
|
0
|
|
|
|
|
|
p->opt[1].price = shortRepPrice; |
26537
|
0
|
|
|
|
|
|
MakeAsShortRep(&p->opt[1]); |
26538
|
|
|
|
|
|
|
} |
26539
|
|
|
|
|
|
|
} |
26540
|
0
|
0
|
|
|
|
|
lenEnd = ((mainLen >= repLens[repMaxIndex]) ? mainLen : repLens[repMaxIndex]); |
26541
|
|
|
|
|
|
|
|
26542
|
0
|
0
|
|
|
|
|
if (lenEnd < 2) |
26543
|
|
|
|
|
|
|
{ |
26544
|
0
|
|
|
|
|
|
*backRes = p->opt[1].backPrev; |
26545
|
0
|
|
|
|
|
|
return 1; |
26546
|
|
|
|
|
|
|
} |
26547
|
|
|
|
|
|
|
|
26548
|
0
|
|
|
|
|
|
p->opt[1].posPrev = 0; |
26549
|
0
|
0
|
|
|
|
|
for (i = 0; i < LZMA_NUM_REPS; i++) |
26550
|
0
|
|
|
|
|
|
p->opt[0].backs[i] = reps[i]; |
26551
|
|
|
|
|
|
|
|
26552
|
|
|
|
|
|
|
len = lenEnd; |
26553
|
|
|
|
|
|
|
do |
26554
|
0
|
|
|
|
|
|
p->opt[len--].price = kInfinityPrice; |
26555
|
0
|
0
|
|
|
|
|
while (len >= 2); |
26556
|
|
|
|
|
|
|
|
26557
|
0
|
0
|
|
|
|
|
for (i = 0; i < LZMA_NUM_REPS; i++) |
26558
|
|
|
|
|
|
|
{ |
26559
|
0
|
|
|
|
|
|
uint32_t repLen = repLens[i]; |
26560
|
|
|
|
|
|
|
uint32_t price; |
26561
|
0
|
0
|
|
|
|
|
if (repLen < 2) |
26562
|
|
|
|
|
|
|
continue; |
26563
|
0
|
|
|
|
|
|
price = repMatchPrice + GetPureRepPrice(p, i, p->state, posState); |
26564
|
0
|
0
|
|
|
|
|
do |
26565
|
|
|
|
|
|
|
{ |
26566
|
0
|
|
|
|
|
|
uint32_t curAndLenPrice = price + p->repLenEnc.prices[posState][repLen - 2]; |
26567
|
|
|
|
|
|
|
COptimal *opt = &p->opt[repLen]; |
26568
|
0
|
0
|
|
|
|
|
if (curAndLenPrice < opt->price) |
26569
|
|
|
|
|
|
|
{ |
26570
|
0
|
|
|
|
|
|
opt->price = curAndLenPrice; |
26571
|
0
|
|
|
|
|
|
opt->posPrev = 0; |
26572
|
0
|
|
|
|
|
|
opt->backPrev = i; |
26573
|
0
|
|
|
|
|
|
opt->prev1IsChar = false; |
26574
|
|
|
|
|
|
|
} |
26575
|
|
|
|
|
|
|
} |
26576
|
|
|
|
|
|
|
while (--repLen >= 2); |
26577
|
|
|
|
|
|
|
} |
26578
|
|
|
|
|
|
|
|
26579
|
0
|
|
|
|
|
|
normalMatchPrice = matchPrice + GET_PRICE_0(p->isRep[p->state]); |
26580
|
|
|
|
|
|
|
|
26581
|
0
|
0
|
|
|
|
|
len = ((repLens[0] >= 2) ? repLens[0] + 1 : 2); |
26582
|
0
|
0
|
|
|
|
|
if (len <= mainLen) |
26583
|
|
|
|
|
|
|
{ |
26584
|
|
|
|
|
|
|
uint32_t offs = 0; |
26585
|
0
|
0
|
|
|
|
|
while (len > matches[offs]) |
26586
|
0
|
|
|
|
|
|
offs += 2; |
26587
|
0
|
|
|
|
|
|
for (; ; len++) |
26588
|
|
|
|
|
|
|
{ |
26589
|
|
|
|
|
|
|
COptimal *opt; |
26590
|
0
|
|
|
|
|
|
uint32_t distance = matches[offs + 1]; |
26591
|
|
|
|
|
|
|
|
26592
|
0
|
|
|
|
|
|
uint32_t curAndLenPrice = normalMatchPrice + p->lenEnc.prices[posState][len - LZMA_MATCH_LEN_MIN]; |
26593
|
0
|
0
|
|
|
|
|
uint32_t lenToPosState = GetLenToPosState(len); |
26594
|
0
|
0
|
|
|
|
|
if (distance < kNumFullDistances) |
26595
|
0
|
|
|
|
|
|
curAndLenPrice += p->distancesPrices[lenToPosState][distance]; |
26596
|
|
|
|
|
|
|
else |
26597
|
|
|
|
|
|
|
{ |
26598
|
|
|
|
|
|
|
uint32_t slot; |
26599
|
0
|
|
|
|
|
|
GetPosSlot2(distance, slot); |
26600
|
0
|
|
|
|
|
|
curAndLenPrice += p->alignPrices[distance & kAlignMask] + p->posSlotPrices[lenToPosState][slot]; |
26601
|
|
|
|
|
|
|
} |
26602
|
|
|
|
|
|
|
opt = &p->opt[len]; |
26603
|
0
|
0
|
|
|
|
|
if (curAndLenPrice < opt->price) |
26604
|
|
|
|
|
|
|
{ |
26605
|
0
|
|
|
|
|
|
opt->price = curAndLenPrice; |
26606
|
0
|
|
|
|
|
|
opt->posPrev = 0; |
26607
|
0
|
|
|
|
|
|
opt->backPrev = distance + LZMA_NUM_REPS; |
26608
|
0
|
|
|
|
|
|
opt->prev1IsChar = false; |
26609
|
|
|
|
|
|
|
} |
26610
|
0
|
0
|
|
|
|
|
if (len == matches[offs]) |
26611
|
|
|
|
|
|
|
{ |
26612
|
0
|
|
|
|
|
|
offs += 2; |
26613
|
0
|
0
|
|
|
|
|
if (offs == numPairs) |
26614
|
|
|
|
|
|
|
break; |
26615
|
|
|
|
|
|
|
} |
26616
|
|
|
|
|
|
|
} |
26617
|
|
|
|
|
|
|
} |
26618
|
|
|
|
|
|
|
|
26619
|
|
|
|
|
|
|
cur = 0; |
26620
|
|
|
|
|
|
|
|
26621
|
|
|
|
|
|
|
for (;;) |
26622
|
|
|
|
|
|
|
{ |
26623
|
|
|
|
|
|
|
uint32_t numAvailFull, newLen, numPairs, posPrev, state, posState, startLen; |
26624
|
|
|
|
|
|
|
uint32_t curPrice, curAnd1Price, matchPrice, repMatchPrice; |
26625
|
|
|
|
|
|
|
bool nextIsChar; |
26626
|
|
|
|
|
|
|
uint8_t curByte, matchByte; |
26627
|
|
|
|
|
|
|
const uint8_t *data; |
26628
|
|
|
|
|
|
|
COptimal *curOpt; |
26629
|
|
|
|
|
|
|
COptimal *nextOpt; |
26630
|
|
|
|
|
|
|
|
26631
|
0
|
|
|
|
|
|
cur++; |
26632
|
0
|
0
|
|
|
|
|
if (cur == lenEnd) |
26633
|
0
|
|
|
|
|
|
return Backward(p, backRes, cur); |
26634
|
|
|
|
|
|
|
|
26635
|
0
|
|
|
|
|
|
newLen = ReadMatchDistances(p, &numPairs); |
26636
|
0
|
0
|
|
|
|
|
if (newLen >= p->numFastBytes) |
26637
|
|
|
|
|
|
|
{ |
26638
|
0
|
|
|
|
|
|
p->numPairs = numPairs; |
26639
|
0
|
|
|
|
|
|
p->longestMatchLength = newLen; |
26640
|
0
|
|
|
|
|
|
return Backward(p, backRes, cur); |
26641
|
|
|
|
|
|
|
} |
26642
|
0
|
|
|
|
|
|
position++; |
26643
|
|
|
|
|
|
|
curOpt = &p->opt[cur]; |
26644
|
0
|
|
|
|
|
|
posPrev = curOpt->posPrev; |
26645
|
0
|
0
|
|
|
|
|
if (curOpt->prev1IsChar) |
26646
|
|
|
|
|
|
|
{ |
26647
|
0
|
|
|
|
|
|
posPrev--; |
26648
|
0
|
0
|
|
|
|
|
if (curOpt->prev2) |
26649
|
|
|
|
|
|
|
{ |
26650
|
0
|
|
|
|
|
|
state = p->opt[curOpt->posPrev2].state; |
26651
|
0
|
0
|
|
|
|
|
if (curOpt->backPrev2 < LZMA_NUM_REPS) |
26652
|
0
|
|
|
|
|
|
state = kRepNextStates[state]; |
26653
|
|
|
|
|
|
|
else |
26654
|
0
|
|
|
|
|
|
state = kMatchNextStates[state]; |
26655
|
|
|
|
|
|
|
} |
26656
|
|
|
|
|
|
|
else |
26657
|
0
|
|
|
|
|
|
state = p->opt[posPrev].state; |
26658
|
0
|
|
|
|
|
|
state = kLiteralNextStates[state]; |
26659
|
|
|
|
|
|
|
} |
26660
|
|
|
|
|
|
|
else |
26661
|
0
|
|
|
|
|
|
state = p->opt[posPrev].state; |
26662
|
0
|
0
|
|
|
|
|
if (posPrev == cur - 1) |
26663
|
|
|
|
|
|
|
{ |
26664
|
0
|
0
|
|
|
|
|
if (IsShortRep(curOpt)) |
26665
|
0
|
|
|
|
|
|
state = kShortRepNextStates[state]; |
26666
|
|
|
|
|
|
|
else |
26667
|
0
|
|
|
|
|
|
state = kLiteralNextStates[state]; |
26668
|
|
|
|
|
|
|
} |
26669
|
|
|
|
|
|
|
else |
26670
|
|
|
|
|
|
|
{ |
26671
|
|
|
|
|
|
|
uint32_t pos; |
26672
|
|
|
|
|
|
|
const COptimal *prevOpt; |
26673
|
0
|
0
|
|
|
|
|
if (curOpt->prev1IsChar && curOpt->prev2) |
|
|
0
|
|
|
|
|
|
26674
|
|
|
|
|
|
|
{ |
26675
|
0
|
|
|
|
|
|
posPrev = curOpt->posPrev2; |
26676
|
0
|
|
|
|
|
|
pos = curOpt->backPrev2; |
26677
|
0
|
|
|
|
|
|
state = kRepNextStates[state]; |
26678
|
|
|
|
|
|
|
} |
26679
|
|
|
|
|
|
|
else |
26680
|
|
|
|
|
|
|
{ |
26681
|
0
|
|
|
|
|
|
pos = curOpt->backPrev; |
26682
|
0
|
0
|
|
|
|
|
if (pos < LZMA_NUM_REPS) |
26683
|
0
|
|
|
|
|
|
state = kRepNextStates[state]; |
26684
|
|
|
|
|
|
|
else |
26685
|
0
|
|
|
|
|
|
state = kMatchNextStates[state]; |
26686
|
|
|
|
|
|
|
} |
26687
|
|
|
|
|
|
|
prevOpt = &p->opt[posPrev]; |
26688
|
0
|
0
|
|
|
|
|
if (pos < LZMA_NUM_REPS) |
26689
|
|
|
|
|
|
|
{ |
26690
|
|
|
|
|
|
|
uint32_t i; |
26691
|
0
|
|
|
|
|
|
reps[0] = prevOpt->backs[pos]; |
26692
|
0
|
0
|
|
|
|
|
for (i = 1; i <= pos; i++) |
26693
|
0
|
|
|
|
|
|
reps[i] = prevOpt->backs[i - 1]; |
26694
|
0
|
0
|
|
|
|
|
for (; i < LZMA_NUM_REPS; i++) |
26695
|
0
|
|
|
|
|
|
reps[i] = prevOpt->backs[i]; |
26696
|
|
|
|
|
|
|
} |
26697
|
|
|
|
|
|
|
else |
26698
|
|
|
|
|
|
|
{ |
26699
|
|
|
|
|
|
|
uint32_t i; |
26700
|
0
|
|
|
|
|
|
reps[0] = (pos - LZMA_NUM_REPS); |
26701
|
0
|
0
|
|
|
|
|
for (i = 1; i < LZMA_NUM_REPS; i++) |
26702
|
0
|
|
|
|
|
|
reps[i] = prevOpt->backs[i - 1]; |
26703
|
|
|
|
|
|
|
} |
26704
|
|
|
|
|
|
|
} |
26705
|
0
|
|
|
|
|
|
curOpt->state = (CState)state; |
26706
|
|
|
|
|
|
|
|
26707
|
0
|
|
|
|
|
|
curOpt->backs[0] = reps[0]; |
26708
|
0
|
|
|
|
|
|
curOpt->backs[1] = reps[1]; |
26709
|
0
|
|
|
|
|
|
curOpt->backs[2] = reps[2]; |
26710
|
0
|
|
|
|
|
|
curOpt->backs[3] = reps[3]; |
26711
|
|
|
|
|
|
|
|
26712
|
0
|
|
|
|
|
|
curPrice = curOpt->price; |
26713
|
|
|
|
|
|
|
nextIsChar = false; |
26714
|
0
|
|
|
|
|
|
data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1; |
26715
|
0
|
|
|
|
|
|
curByte = *data; |
26716
|
0
|
|
|
|
|
|
matchByte = *(data - (reps[0] + 1)); |
26717
|
|
|
|
|
|
|
|
26718
|
0
|
|
|
|
|
|
posState = (position & p->pbMask); |
26719
|
|
|
|
|
|
|
|
26720
|
0
|
|
|
|
|
|
curAnd1Price = curPrice + GET_PRICE_0(p->isMatch[state][posState]); |
26721
|
|
|
|
|
|
|
{ |
26722
|
0
|
|
|
|
|
|
const CLzmaProb *probs = LIT_PROBS(position, *(data - 1)); |
26723
|
|
|
|
|
|
|
curAnd1Price += |
26724
|
|
|
|
|
|
|
(!IsCharState(state) ? |
26725
|
0
|
|
|
|
|
|
LitEnc_GetPriceMatched(probs, curByte, matchByte, p->ProbPrices) : |
26726
|
0
|
0
|
|
|
|
|
LitEnc_GetPrice(probs, curByte, p->ProbPrices)); |
26727
|
|
|
|
|
|
|
} |
26728
|
|
|
|
|
|
|
|
26729
|
0
|
|
|
|
|
|
nextOpt = &p->opt[cur + 1]; |
26730
|
|
|
|
|
|
|
|
26731
|
0
|
0
|
|
|
|
|
if (curAnd1Price < nextOpt->price) |
26732
|
|
|
|
|
|
|
{ |
26733
|
0
|
|
|
|
|
|
nextOpt->price = curAnd1Price; |
26734
|
0
|
|
|
|
|
|
nextOpt->posPrev = cur; |
26735
|
0
|
|
|
|
|
|
MakeAsChar(nextOpt); |
26736
|
|
|
|
|
|
|
nextIsChar = true; |
26737
|
|
|
|
|
|
|
} |
26738
|
|
|
|
|
|
|
|
26739
|
0
|
|
|
|
|
|
matchPrice = curPrice + GET_PRICE_1(p->isMatch[state][posState]); |
26740
|
0
|
|
|
|
|
|
repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[state]); |
26741
|
|
|
|
|
|
|
|
26742
|
0
|
0
|
|
|
|
|
if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0)) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
26743
|
|
|
|
|
|
|
{ |
26744
|
0
|
|
|
|
|
|
uint32_t shortRepPrice = repMatchPrice + GetRepLen1Price(p, state, posState); |
26745
|
0
|
0
|
|
|
|
|
if (shortRepPrice <= nextOpt->price) |
26746
|
|
|
|
|
|
|
{ |
26747
|
0
|
|
|
|
|
|
nextOpt->price = shortRepPrice; |
26748
|
0
|
|
|
|
|
|
nextOpt->posPrev = cur; |
26749
|
0
|
|
|
|
|
|
MakeAsShortRep(nextOpt); |
26750
|
|
|
|
|
|
|
nextIsChar = true; |
26751
|
|
|
|
|
|
|
} |
26752
|
|
|
|
|
|
|
} |
26753
|
0
|
|
|
|
|
|
numAvailFull = p->numAvail; |
26754
|
|
|
|
|
|
|
{ |
26755
|
0
|
|
|
|
|
|
uint32_t temp = kNumOpts - 1 - cur; |
26756
|
0
|
0
|
|
|
|
|
if (temp < numAvailFull) |
26757
|
|
|
|
|
|
|
numAvailFull = temp; |
26758
|
|
|
|
|
|
|
} |
26759
|
|
|
|
|
|
|
|
26760
|
0
|
0
|
|
|
|
|
if (numAvailFull < 2) |
26761
|
0
|
|
|
|
|
|
continue; |
26762
|
0
|
0
|
|
|
|
|
numAvail = (numAvailFull <= p->numFastBytes ? numAvailFull : p->numFastBytes); |
26763
|
|
|
|
|
|
|
|
26764
|
0
|
0
|
|
|
|
|
if (!nextIsChar && matchByte != curByte) /* speed optimization */ |
26765
|
|
|
|
|
|
|
{ |
26766
|
|
|
|
|
|
|
/* try Literal + rep0 */ |
26767
|
|
|
|
|
|
|
uint32_t temp; |
26768
|
|
|
|
|
|
|
uint32_t lenTest2; |
26769
|
|
|
|
|
|
|
const uint8_t *data2 = data - (reps[0] + 1); |
26770
|
0
|
|
|
|
|
|
uint32_t limit = p->numFastBytes + 1; |
26771
|
0
|
0
|
|
|
|
|
if (limit > numAvailFull) |
26772
|
|
|
|
|
|
|
limit = numAvailFull; |
26773
|
|
|
|
|
|
|
|
26774
|
0
|
0
|
|
|
|
|
for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++); |
|
|
0
|
|
|
|
|
|
26775
|
0
|
|
|
|
|
|
lenTest2 = temp - 1; |
26776
|
0
|
0
|
|
|
|
|
if (lenTest2 >= 2) |
26777
|
|
|
|
|
|
|
{ |
26778
|
0
|
|
|
|
|
|
uint32_t state2 = kLiteralNextStates[state]; |
26779
|
0
|
|
|
|
|
|
uint32_t posStateNext = (position + 1) & p->pbMask; |
26780
|
0
|
|
|
|
|
|
uint32_t nextRepMatchPrice = curAnd1Price + |
26781
|
0
|
|
|
|
|
|
GET_PRICE_1(p->isMatch[state2][posStateNext]) + |
26782
|
0
|
|
|
|
|
|
GET_PRICE_1(p->isRep[state2]); |
26783
|
|
|
|
|
|
|
/* for (; lenTest2 >= 2; lenTest2--) */ |
26784
|
|
|
|
|
|
|
{ |
26785
|
|
|
|
|
|
|
uint32_t curAndLenPrice; |
26786
|
|
|
|
|
|
|
COptimal *opt; |
26787
|
0
|
|
|
|
|
|
uint32_t offset = cur + 1 + lenTest2; |
26788
|
0
|
0
|
|
|
|
|
while (lenEnd < offset) |
26789
|
0
|
|
|
|
|
|
p->opt[++lenEnd].price = kInfinityPrice; |
26790
|
0
|
|
|
|
|
|
curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext); |
26791
|
|
|
|
|
|
|
opt = &p->opt[offset]; |
26792
|
0
|
0
|
|
|
|
|
if (curAndLenPrice < opt->price) |
26793
|
|
|
|
|
|
|
{ |
26794
|
0
|
|
|
|
|
|
opt->price = curAndLenPrice; |
26795
|
0
|
|
|
|
|
|
opt->posPrev = cur + 1; |
26796
|
0
|
|
|
|
|
|
opt->backPrev = 0; |
26797
|
0
|
|
|
|
|
|
opt->prev1IsChar = true; |
26798
|
0
|
|
|
|
|
|
opt->prev2 = false; |
26799
|
|
|
|
|
|
|
} |
26800
|
|
|
|
|
|
|
} |
26801
|
|
|
|
|
|
|
} |
26802
|
|
|
|
|
|
|
} |
26803
|
|
|
|
|
|
|
|
26804
|
|
|
|
|
|
|
startLen = 2; /* speed optimization */ |
26805
|
|
|
|
|
|
|
{ |
26806
|
|
|
|
|
|
|
uint32_t repIndex; |
26807
|
0
|
0
|
|
|
|
|
for (repIndex = 0; repIndex < LZMA_NUM_REPS; repIndex++) |
26808
|
|
|
|
|
|
|
{ |
26809
|
|
|
|
|
|
|
uint32_t lenTest; |
26810
|
|
|
|
|
|
|
uint32_t lenTestTemp; |
26811
|
|
|
|
|
|
|
uint32_t price; |
26812
|
0
|
|
|
|
|
|
const uint8_t *data2 = data - (reps[repIndex] + 1); |
26813
|
0
|
0
|
|
|
|
|
if (data[0] != data2[0] || data[1] != data2[1]) |
|
|
0
|
|
|
|
|
|
26814
|
|
|
|
|
|
|
continue; |
26815
|
0
|
0
|
|
|
|
|
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
|
0
|
|
|
|
|
|
26816
|
0
|
0
|
|
|
|
|
while (lenEnd < cur + lenTest) |
26817
|
0
|
|
|
|
|
|
p->opt[++lenEnd].price = kInfinityPrice; |
26818
|
|
|
|
|
|
|
lenTestTemp = lenTest; |
26819
|
0
|
|
|
|
|
|
price = repMatchPrice + GetPureRepPrice(p, repIndex, state, posState); |
26820
|
0
|
0
|
|
|
|
|
do |
26821
|
|
|
|
|
|
|
{ |
26822
|
0
|
|
|
|
|
|
uint32_t curAndLenPrice = price + p->repLenEnc.prices[posState][lenTest - 2]; |
26823
|
0
|
|
|
|
|
|
COptimal *opt = &p->opt[cur + lenTest]; |
26824
|
0
|
0
|
|
|
|
|
if (curAndLenPrice < opt->price) |
26825
|
|
|
|
|
|
|
{ |
26826
|
0
|
|
|
|
|
|
opt->price = curAndLenPrice; |
26827
|
0
|
|
|
|
|
|
opt->posPrev = cur; |
26828
|
0
|
|
|
|
|
|
opt->backPrev = repIndex; |
26829
|
0
|
|
|
|
|
|
opt->prev1IsChar = false; |
26830
|
|
|
|
|
|
|
} |
26831
|
|
|
|
|
|
|
} |
26832
|
|
|
|
|
|
|
while (--lenTest >= 2); |
26833
|
|
|
|
|
|
|
lenTest = lenTestTemp; |
26834
|
|
|
|
|
|
|
|
26835
|
0
|
0
|
|
|
|
|
if (repIndex == 0) |
26836
|
0
|
|
|
|
|
|
startLen = lenTest + 1; |
26837
|
|
|
|
|
|
|
|
26838
|
|
|
|
|
|
|
/* if (_maxMode) */ |
26839
|
|
|
|
|
|
|
if (1) |
26840
|
|
|
|
|
|
|
{ |
26841
|
0
|
|
|
|
|
|
uint32_t lenTest2 = lenTest + 1; |
26842
|
0
|
|
|
|
|
|
uint32_t limit = lenTest2 + p->numFastBytes; |
26843
|
|
|
|
|
|
|
uint32_t nextRepMatchPrice; |
26844
|
0
|
0
|
|
|
|
|
if (limit > numAvailFull) |
26845
|
|
|
|
|
|
|
limit = numAvailFull; |
26846
|
0
|
0
|
|
|
|
|
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
|
0
|
|
|
|
|
|
26847
|
0
|
|
|
|
|
|
lenTest2 -= lenTest + 1; |
26848
|
0
|
0
|
|
|
|
|
if (lenTest2 >= 2) |
26849
|
|
|
|
|
|
|
{ |
26850
|
0
|
|
|
|
|
|
uint32_t state2 = kRepNextStates[state]; |
26851
|
0
|
|
|
|
|
|
uint32_t posStateNext = (position + lenTest) & p->pbMask; |
26852
|
|
|
|
|
|
|
uint32_t curAndLenCharPrice = |
26853
|
0
|
|
|
|
|
|
price + p->repLenEnc.prices[posState][lenTest - 2] + |
26854
|
0
|
|
|
|
|
|
GET_PRICE_0(p->isMatch[state2][posStateNext]) + |
26855
|
0
|
|
|
|
|
|
LitEnc_GetPriceMatched(LIT_PROBS(position + lenTest, data[lenTest - 1]), |
26856
|
0
|
|
|
|
|
|
data[lenTest], data2[lenTest], p->ProbPrices); |
26857
|
0
|
|
|
|
|
|
state2 = kLiteralNextStates[state2]; |
26858
|
0
|
|
|
|
|
|
posStateNext = (position + lenTest + 1) & p->pbMask; |
26859
|
0
|
|
|
|
|
|
nextRepMatchPrice = curAndLenCharPrice + |
26860
|
0
|
|
|
|
|
|
GET_PRICE_1(p->isMatch[state2][posStateNext]) + |
26861
|
0
|
|
|
|
|
|
GET_PRICE_1(p->isRep[state2]); |
26862
|
|
|
|
|
|
|
|
26863
|
|
|
|
|
|
|
/* for (; lenTest2 >= 2; lenTest2--) */ |
26864
|
|
|
|
|
|
|
{ |
26865
|
|
|
|
|
|
|
uint32_t curAndLenPrice; |
26866
|
|
|
|
|
|
|
COptimal *opt; |
26867
|
0
|
|
|
|
|
|
uint32_t offset = cur + lenTest + 1 + lenTest2; |
26868
|
0
|
0
|
|
|
|
|
while (lenEnd < offset) |
26869
|
0
|
|
|
|
|
|
p->opt[++lenEnd].price = kInfinityPrice; |
26870
|
0
|
|
|
|
|
|
curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext); |
26871
|
|
|
|
|
|
|
opt = &p->opt[offset]; |
26872
|
0
|
0
|
|
|
|
|
if (curAndLenPrice < opt->price) |
26873
|
|
|
|
|
|
|
{ |
26874
|
0
|
|
|
|
|
|
opt->price = curAndLenPrice; |
26875
|
0
|
|
|
|
|
|
opt->posPrev = cur + lenTest + 1; |
26876
|
0
|
|
|
|
|
|
opt->backPrev = 0; |
26877
|
0
|
|
|
|
|
|
opt->prev1IsChar = true; |
26878
|
0
|
|
|
|
|
|
opt->prev2 = true; |
26879
|
0
|
|
|
|
|
|
opt->posPrev2 = cur; |
26880
|
0
|
|
|
|
|
|
opt->backPrev2 = repIndex; |
26881
|
|
|
|
|
|
|
} |
26882
|
|
|
|
|
|
|
} |
26883
|
|
|
|
|
|
|
} |
26884
|
|
|
|
|
|
|
} |
26885
|
|
|
|
|
|
|
} |
26886
|
|
|
|
|
|
|
} |
26887
|
|
|
|
|
|
|
/* for (uint32_t lenTest = 2; lenTest <= newLen; lenTest++) */ |
26888
|
0
|
0
|
|
|
|
|
if (newLen > numAvail) |
26889
|
|
|
|
|
|
|
{ |
26890
|
|
|
|
|
|
|
newLen = numAvail; |
26891
|
0
|
0
|
|
|
|
|
for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2); |
26892
|
0
|
|
|
|
|
|
matches[numPairs] = newLen; |
26893
|
0
|
|
|
|
|
|
numPairs += 2; |
26894
|
|
|
|
|
|
|
} |
26895
|
0
|
0
|
|
|
|
|
if (newLen >= startLen) |
26896
|
|
|
|
|
|
|
{ |
26897
|
0
|
|
|
|
|
|
uint32_t normalMatchPrice = matchPrice + GET_PRICE_0(p->isRep[state]); |
26898
|
|
|
|
|
|
|
uint32_t offs, curBack, posSlot; |
26899
|
|
|
|
|
|
|
uint32_t lenTest; |
26900
|
0
|
0
|
|
|
|
|
while (lenEnd < cur + newLen) |
26901
|
0
|
|
|
|
|
|
p->opt[++lenEnd].price = kInfinityPrice; |
26902
|
|
|
|
|
|
|
|
26903
|
|
|
|
|
|
|
offs = 0; |
26904
|
0
|
0
|
|
|
|
|
while (startLen > matches[offs]) |
26905
|
0
|
|
|
|
|
|
offs += 2; |
26906
|
0
|
|
|
|
|
|
curBack = matches[offs + 1]; |
26907
|
0
|
|
|
|
|
|
GetPosSlot2(curBack, posSlot); |
26908
|
0
|
|
|
|
|
|
for (lenTest = /*2*/ startLen; ; lenTest++) |
26909
|
|
|
|
|
|
|
{ |
26910
|
0
|
|
|
|
|
|
uint32_t curAndLenPrice = normalMatchPrice + p->lenEnc.prices[posState][lenTest - LZMA_MATCH_LEN_MIN]; |
26911
|
0
|
0
|
|
|
|
|
uint32_t lenToPosState = GetLenToPosState(lenTest); |
26912
|
|
|
|
|
|
|
COptimal *opt; |
26913
|
0
|
0
|
|
|
|
|
if (curBack < kNumFullDistances) |
26914
|
0
|
|
|
|
|
|
curAndLenPrice += p->distancesPrices[lenToPosState][curBack]; |
26915
|
|
|
|
|
|
|
else |
26916
|
0
|
|
|
|
|
|
curAndLenPrice += p->posSlotPrices[lenToPosState][posSlot] + p->alignPrices[curBack & kAlignMask]; |
26917
|
|
|
|
|
|
|
|
26918
|
0
|
|
|
|
|
|
opt = &p->opt[cur + lenTest]; |
26919
|
0
|
0
|
|
|
|
|
if (curAndLenPrice < opt->price) |
26920
|
|
|
|
|
|
|
{ |
26921
|
0
|
|
|
|
|
|
opt->price = curAndLenPrice; |
26922
|
0
|
|
|
|
|
|
opt->posPrev = cur; |
26923
|
0
|
|
|
|
|
|
opt->backPrev = curBack + LZMA_NUM_REPS; |
26924
|
0
|
|
|
|
|
|
opt->prev1IsChar = false; |
26925
|
|
|
|
|
|
|
} |
26926
|
|
|
|
|
|
|
|
26927
|
0
|
0
|
|
|
|
|
if (/*_maxMode && */lenTest == matches[offs]) |
26928
|
|
|
|
|
|
|
{ |
26929
|
|
|
|
|
|
|
/* Try Match + Literal + Rep0 */ |
26930
|
0
|
|
|
|
|
|
const uint8_t *data2 = data - (curBack + 1); |
26931
|
0
|
|
|
|
|
|
uint32_t lenTest2 = lenTest + 1; |
26932
|
0
|
|
|
|
|
|
uint32_t limit = lenTest2 + p->numFastBytes; |
26933
|
|
|
|
|
|
|
uint32_t nextRepMatchPrice; |
26934
|
0
|
0
|
|
|
|
|
if (limit > numAvailFull) |
26935
|
|
|
|
|
|
|
limit = numAvailFull; |
26936
|
0
|
0
|
|
|
|
|
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
|
0
|
|
|
|
|
|
26937
|
0
|
|
|
|
|
|
lenTest2 -= lenTest + 1; |
26938
|
0
|
0
|
|
|
|
|
if (lenTest2 >= 2) |
26939
|
|
|
|
|
|
|
{ |
26940
|
0
|
|
|
|
|
|
uint32_t state2 = kMatchNextStates[state]; |
26941
|
0
|
|
|
|
|
|
uint32_t posStateNext = (position + lenTest) & p->pbMask; |
26942
|
0
|
|
|
|
|
|
uint32_t curAndLenCharPrice = curAndLenPrice + |
26943
|
0
|
|
|
|
|
|
GET_PRICE_0(p->isMatch[state2][posStateNext]) + |
26944
|
0
|
|
|
|
|
|
LitEnc_GetPriceMatched(LIT_PROBS(position + lenTest, data[lenTest - 1]), |
26945
|
0
|
|
|
|
|
|
data[lenTest], data2[lenTest], p->ProbPrices); |
26946
|
0
|
|
|
|
|
|
state2 = kLiteralNextStates[state2]; |
26947
|
0
|
|
|
|
|
|
posStateNext = (posStateNext + 1) & p->pbMask; |
26948
|
0
|
|
|
|
|
|
nextRepMatchPrice = curAndLenCharPrice + |
26949
|
0
|
|
|
|
|
|
GET_PRICE_1(p->isMatch[state2][posStateNext]) + |
26950
|
0
|
|
|
|
|
|
GET_PRICE_1(p->isRep[state2]); |
26951
|
|
|
|
|
|
|
|
26952
|
|
|
|
|
|
|
/* for (; lenTest2 >= 2; lenTest2--) */ |
26953
|
|
|
|
|
|
|
{ |
26954
|
0
|
|
|
|
|
|
uint32_t offset = cur + lenTest + 1 + lenTest2; |
26955
|
|
|
|
|
|
|
uint32_t curAndLenPrice; |
26956
|
|
|
|
|
|
|
COptimal *opt; |
26957
|
0
|
0
|
|
|
|
|
while (lenEnd < offset) |
26958
|
0
|
|
|
|
|
|
p->opt[++lenEnd].price = kInfinityPrice; |
26959
|
0
|
|
|
|
|
|
curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext); |
26960
|
|
|
|
|
|
|
opt = &p->opt[offset]; |
26961
|
0
|
0
|
|
|
|
|
if (curAndLenPrice < opt->price) |
26962
|
|
|
|
|
|
|
{ |
26963
|
0
|
|
|
|
|
|
opt->price = curAndLenPrice; |
26964
|
0
|
|
|
|
|
|
opt->posPrev = cur + lenTest + 1; |
26965
|
0
|
|
|
|
|
|
opt->backPrev = 0; |
26966
|
0
|
|
|
|
|
|
opt->prev1IsChar = true; |
26967
|
0
|
|
|
|
|
|
opt->prev2 = true; |
26968
|
0
|
|
|
|
|
|
opt->posPrev2 = cur; |
26969
|
0
|
|
|
|
|
|
opt->backPrev2 = curBack + LZMA_NUM_REPS; |
26970
|
|
|
|
|
|
|
} |
26971
|
|
|
|
|
|
|
} |
26972
|
|
|
|
|
|
|
} |
26973
|
0
|
|
|
|
|
|
offs += 2; |
26974
|
0
|
0
|
|
|
|
|
if (offs == numPairs) |
26975
|
|
|
|
|
|
|
break; |
26976
|
0
|
|
|
|
|
|
curBack = matches[offs + 1]; |
26977
|
0
|
0
|
|
|
|
|
if (curBack >= kNumFullDistances) |
26978
|
0
|
|
|
|
|
|
GetPosSlot2(curBack, posSlot); |
26979
|
|
|
|
|
|
|
} |
26980
|
|
|
|
|
|
|
} |
26981
|
|
|
|
|
|
|
} |
26982
|
|
|
|
|
|
|
} |
26983
|
|
|
|
|
|
|
} |
26984
|
|
|
|
|
|
|
|
26985
|
|
|
|
|
|
|
/* ChangePair(smallDist, bigDist): true when bigDist shifted right by 7 bits
   (i.e. divided by 128) is still greater than smallDist. GetOptimumFast uses
   it when comparing the distances of two candidate matches. */
#define ChangePair(smallDist, bigDist) (((bigDist) >> 7) > (smallDist)) |
26986
|
|
|
|
|
|
|
|
26987
|
0
|
|
|
|
|
|
/* Heuristic parser used when p->fastMode is set: chooses the next symbol
   without the price-based search.
   Returns the length to encode. *backRes receives (uint32_t)-1 for a literal
   (returned length 1), a rep index below LZMA_NUM_REPS for a rep match, or
   distance + LZMA_NUM_REPS for a normal match. */
static uint32_t GetOptimumFast(CLzmaEnc *p, uint32_t *backRes)
{
  uint32_t bestLen, pairCount;

  /* With a non-zero additionalOffset, take the stored match results
     (written by the look-ahead read further down) instead of reading anew. */
  if (p->additionalOffset != 0)
  {
    bestLen = p->longestMatchLength;
    pairCount = p->numPairs;
  }
  else
    bestLen = ReadMatchDistances(p, &pairCount);

  uint32_t avail = p->numAvail;
  *backRes = (uint32_t)-1;
  if (avail < 2)
    return 1;
  if (avail > LZMA_MATCH_LEN_MAX)
    avail = LZMA_MATCH_LEN_MAX;

  /* cur points one byte before the match finder's current position. */
  const uint8_t *cur = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;

  /* Find the longest rep match; accept immediately once it reaches numFastBytes. */
  uint32_t bestRep = 0;
  uint32_t bestRepLen = 0;
  for (uint32_t rep = 0; rep < LZMA_NUM_REPS; ++rep)
  {
    const uint8_t *prev = cur - (p->reps[rep] + 1);
    if (cur[0] == prev[0] && cur[1] == prev[1])
    {
      uint32_t len = 2;
      while (len < avail && cur[len] == prev[len])
        ++len;
      if (len >= p->numFastBytes)
      {
        *backRes = rep;
        MovePos(p, len - 1);
        return len;
      }
      if (len > bestRepLen)
      {
        bestRep = rep;
        bestRepLen = len;
      }
    }
  }

  /* A normal match that reaches numFastBytes is accepted right away. */
  const uint32_t *pairs = p->matches;
  if (bestLen >= p->numFastBytes)
  {
    *backRes = pairs[pairCount - 1] + LZMA_NUM_REPS;
    MovePos(p, bestLen - 1);
    return bestLen;
  }

  uint32_t bestDist = 0; /* initialised so the comparisons below are defined when bestLen < 2 */
  if (bestLen >= 2)
  {
    bestDist = pairs[pairCount - 1];
    /* Step back to the previous (len, dist) pair while it is exactly one
       byte shorter and ChangePair says its distance is far smaller. */
    while (pairCount > 2 &&
           bestLen == pairs[pairCount - 4] + 1 &&
           ChangePair(pairs[pairCount - 3], bestDist))
    {
      pairCount -= 2;
      bestLen = pairs[pairCount - 2];
      bestDist = pairs[pairCount - 1];
    }
    if (bestLen == 2 && bestDist >= 0x80)
      bestLen = 1;
  }

  /* Prefer the rep match when it is nearly as long as the normal match;
     the allowed shortfall grows with the normal match's distance. */
  if (bestRepLen >= 2)
  {
    const bool repWins =
        (bestRepLen + 1 >= bestLen) ||
        (bestRepLen + 2 >= bestLen && bestDist >= (1 << 9)) ||
        (bestRepLen + 3 >= bestLen && bestDist >= (1 << 15));
    if (repWins)
    {
      *backRes = bestRep;
      MovePos(p, bestRepLen - 1);
      return bestRepLen;
    }
  }

  if (bestLen < 2 || avail <= 2)
    return 1;

  /* Read the matches for the following position; emit a literal now if
     that position offers a better match. */
  p->longestMatchLength = ReadMatchDistances(p, &p->numPairs);
  if (p->longestMatchLength >= 2)
  {
    const uint32_t nextLen = p->longestMatchLength;
    const uint32_t nextDist = pairs[p->numPairs - 1];
    if ((nextLen >= bestLen && nextDist < bestDist) ||
        (nextLen == bestLen + 1 && !ChangePair(bestDist, nextDist)) ||
        (nextLen > bestLen + 1) ||
        (nextLen + 1 >= bestLen && bestLen >= 3 && ChangePair(nextDist, bestDist)))
      return 1;
  }

  /* Likewise emit a literal if any rep distance matches at least
     bestLen - 1 bytes at the re-fetched position. */
  cur = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
  const uint32_t limit = bestLen - 1;
  for (uint32_t rep = 0; rep < LZMA_NUM_REPS; ++rep)
  {
    const uint8_t *prev = cur - (p->reps[rep] + 1);
    if (cur[0] != prev[0] || cur[1] != prev[1])
      continue;
    uint32_t len = 2;
    while (len < limit && cur[len] == prev[len])
      ++len;
    if (len >= limit)
      return 1;
  }

  *backRes = bestDist + LZMA_NUM_REPS;
  MovePos(p, bestLen - 2);
  return bestLen;
}
27094
|
|
|
|
|
|
|
|
27095
|
0
|
|
|
|
|
|
/* Emits the end-of-stream marker: a normal (non-rep) match of minimum
   length whose position slot, direct bits and align bits are all ones. */
static void WriteEndMarker(CLzmaEnc *p, uint32_t posState)
{
  const uint32_t markerLen = LZMA_MATCH_LEN_MIN;
  const uint32_t allOnesSlot = (1 << kNumPosSlotBits) - 1;

  /* "is match" = 1, "is rep" = 0, then advance the state as for a match. */
  RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 1);
  RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 0);
  p->state = kMatchNextStates[p->state];

  LenEnc_Encode2(&p->lenEnc, &p->rc, markerLen - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices);

  /* Distance: highest slot, all-ones direct bits, all-ones align bits. */
  RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(markerLen)], kNumPosSlotBits, allOnesSlot);
  RangeEnc_EncodeDirectBits(&p->rc, (((uint32_t)1 << 30) - 1) >> kNumAlignBits, 30 - kNumAlignBits);
  RcTree_ReverseEncode(&p->rc, p->posAlignEncoder, kNumAlignBits, kAlignMask);
}
27107
|
|
|
|
|
|
|
|
27108
|
|
|
|
|
|
|
/* Folds range-coder and match-finder error states into p->result and
   returns it. An error already stored in p->result is returned unchanged.
   When a new error is detected the encoder is marked finished. */
static SRes CheckErrors(CLzmaEnc *p)
{
  if (p->result == SZ_OK)
  {
    if (p->rc.res != SZ_OK)
      p->result = SZ_ERROR_WRITE;
    /* Checked second, so a read error overrides a write error. */
    if (p->matchFinderBase.result != SZ_OK)
      p->result = SZ_ERROR_READ;
    if (p->result != SZ_OK)
      p->finished = true;
  }
  return p->result;
}
27120
|
|
|
|
|
|
|
|
27121
|
0
|
|
|
|
|
|
/* Finishes the stream: marks the encoder finished, writes the end marker
   if p->writeEndMark is set, flushes the range coder, and returns the
   result of CheckErrors. */
static SRes Flush(CLzmaEnc *p, uint32_t nowPos)
{
  p->finished = true;

  if (p->writeEndMark)
  {
    const uint32_t posState = nowPos & p->pbMask;
    WriteEndMarker(p, posState);
  }

  RangeEnc_FlushData(&p->rc);
  RangeEnc_FlushStream(&p->rc);

  return CheckErrors(p);
}
27131
|
|
|
|
|
|
|
|
27132
|
0
|
|
|
|
|
|
/* Recomputes p->alignPrices for every entry of the align table from the
   current posAlignEncoder probabilities and resets alignPriceCount. */
static void FillAlignPrices(CLzmaEnc *p)
{
  uint32_t sym = 0;
  while (sym < kAlignTableSize)
  {
    p->alignPrices[sym] = RcTree_ReverseGetPrice(p->posAlignEncoder, kNumAlignBits, sym, p->ProbPrices);
    ++sym;
  }
  p->alignPriceCount = 0;
}
27139
|
|
|
|
|
|
|
|
27140
|
0
|
|
|
|
|
|
/* Rebuilds the per-length-state price tables p->posSlotPrices and
   p->distancesPrices from the current probability models, then resets
   matchPriceCount. */
static void FillDistancesPrices(CLzmaEnc *p)
{
  /* Price of the reverse-coded footer bits for each distance in
     [kStartPosModelIndex, kNumFullDistances); lower entries are unused. */
  uint32_t footerPrices[kNumFullDistances];

  for (uint32_t dist = kStartPosModelIndex; dist < kNumFullDistances; ++dist)
  {
    const uint32_t slot = GetPosSlot1(dist);
    const uint32_t footerBits = (slot >> 1) - 1;
    const uint32_t base = (2 | (slot & 1)) << footerBits;
    footerPrices[dist] = RcTree_ReverseGetPrice(p->posEncoders + base - slot - 1, footerBits, dist - base, p->ProbPrices);
  }

  for (uint32_t lenState = 0; lenState < kNumLenToPosStates; ++lenState)
  {
    const CLzmaProb *slotProbs = p->posSlotEncoder[lenState];
    uint32_t *slotPrices = p->posSlotPrices[lenState];
    uint32_t *distPrices = p->distancesPrices[lenState];
    uint32_t slot;
    uint32_t dist;

    /* Price of every position slot under this length state. */
    slot = 0;
    while (slot < p->distTableSize)
    {
      slotPrices[slot] = RcTree_GetPrice(slotProbs, kNumPosSlotBits, slot, p->ProbPrices);
      ++slot;
    }

    /* Slots from kEndPosModelIndex up also pay a fixed price for their
       footer bits beyond the align bits. */
    slot = kEndPosModelIndex;
    while (slot < p->distTableSize)
    {
      slotPrices[slot] += ((((slot >> 1) - 1) - kNumAlignBits) << kNumBitPriceShiftBits);
      ++slot;
    }

    /* Below kStartPosModelIndex the distance index is used directly as the
       slot index; above it the footer price is added to the slot price. */
    dist = 0;
    while (dist < kStartPosModelIndex)
    {
      distPrices[dist] = slotPrices[dist];
      ++dist;
    }
    while (dist < kNumFullDistances)
    {
      distPrices[dist] = slotPrices[GetPosSlot1(dist)] + footerPrices[dist];
      ++dist;
    }
  }

  p->matchPriceCount = 0;
}
27173
|
|
|
|
|
|
|
|
27174
|
0
|
|
|
|
|
|
/* Puts a freshly allocated CLzmaEnc into its initial state: constructs the
   range coder and match finder, applies the default properties, builds the
   lookup/price tables and clears the literal-probability pointers. */
void LzmaEnc_Construct(CLzmaEnc *p)
{
  RangeEnc_Construct(&p->rc);
  MatchFinder_Construct(&p->matchFinderBase);

  /* Apply the properties produced by LzmaEncProps_Init. */
  CLzmaEncProps defaults;
  LzmaEncProps_Init(&defaults);
  LzmaEnc_SetProps(p, &defaults);

  #ifndef LZMA_LOG_BSR
  LzmaEnc_FastPosInit(p->g_FastPos);
  #endif

  LzmaEnc_InitPriceTables(p->ProbPrices);

  /* No literal tables yet; LzmaEnc_Alloc allocates them. */
  p->litProbs = 0;
  p->saveState.litProbs = 0;
}
27193
|
|
|
|
|
|
|
|
27194
|
0
|
|
|
|
|
|
/* Allocates a CLzmaEnc through alloc and constructs it.
   Returns the new handle, or a null handle if the allocation failed. */
CLzmaEncHandle LzmaEnc_Create(ISzAlloc *alloc)
{
  void *mem = alloc->Alloc(alloc, sizeof(CLzmaEnc));
  if (mem == 0)
    return mem;

  LzmaEnc_Construct((CLzmaEnc *)mem);
  return mem;
}
27202
|
|
|
|
|
|
|
|
27203
|
0
|
|
|
|
|
|
/* Releases both literal-probability tables (the live one and the one in
   saveState) through alloc and clears the pointers. */
void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAlloc *alloc)
{
  alloc->Free(alloc, p->litProbs);
  p->litProbs = 0;

  alloc->Free(alloc, p->saveState.litProbs);
  p->saveState.litProbs = 0;
}
27210
|
|
|
|
|
|
|
|
27211
|
0
|
|
|
|
|
|
/* Releases everything the encoder owns without freeing the CLzmaEnc
   object itself: match-finder memory goes back to allocBig, the literal
   tables and the range coder go back to alloc. */
void LzmaEnc_Destruct(CLzmaEnc *p, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  /* Match-finder memory is released through allocBig. */
  MatchFinder_Free(&p->matchFinderBase, allocBig);

  /* Literal tables and the range coder are released through alloc. */
  LzmaEnc_FreeLits(p, alloc);
  RangeEnc_Free(&p->rc, alloc);
}
27217
|
|
|
|
|
|
|
|
27218
|
0
|
|
|
|
|
|
/* Destructs the encoder behind handle p and then frees the object itself
   through alloc. */
void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  CLzmaEnc *enc = (CLzmaEnc *)p;
  LzmaEnc_Destruct(enc, alloc, allocBig);
  alloc->Free(alloc, p);
}
27223
|
|
|
|
|
|
|
|
27224
|
0
|
|
|
|
|
|
/* Encodes the next stretch of input.
   Returns p->result immediately if the encoder is already finished, or the
   pending error if CheckErrors reports one.
   Exit paths of the main loop:
     - no input left, or (useLimits) a pack/unpack size limit is close:
       leave the loop and return through Flush;
     - !useLimits and at least 1 << 15 positions were processed in this
       call: update nowPos64 and return CheckErrors (no flush). */
static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, bool useLimits, uint32_t maxPackSize, uint32_t maxUnpackSize)
{
  if (p->needInit)
  {
    p->matchFinder.Init(p->matchFinderObj);
    p->needInit = 0;
  }

  if (p->finished)
    return p->result;
  RINOK(CheckErrors(p));

  const uint32_t blockStart = (uint32_t)p->nowPos64;
  uint32_t curPos = blockStart;

  /* The very first byte of the stream is always coded as a plain literal. */
  if (p->nowPos64 == 0)
  {
    if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0)
      return Flush(p, curPos);

    uint32_t ignoredPairs;
    ReadMatchDistances(p, &ignoredPairs);
    RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][0], 0);
    p->state = kLiteralNextStates[p->state];
    const uint8_t firstByte = p->matchFinder.GetIndexByte(p->matchFinderObj, 0 - p->additionalOffset);
    LitEnc_Encode(&p->rc, p->litProbs, firstByte);
    p->additionalOffset--;
    curPos++;
  }

  if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) != 0)
  {
    while (true)
    {
      /* back: (uint32_t)-1 = literal, < LZMA_NUM_REPS = rep index,
         otherwise distance + LZMA_NUM_REPS. */
      uint32_t back;
      const uint32_t len = p->fastMode ? GetOptimumFast(p, &back)
                                       : GetOptimum(p, curPos, &back);
      const uint32_t posState = curPos & p->pbMask;

      if (len == 1 && back == (uint32_t)-1)
      {
        /* Literal. */
        RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 0);
        const uint8_t *data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset;
        const uint8_t curByte = *data;
        CLzmaProb *probs = LIT_PROBS(curPos, *(data - 1));
        if (!IsCharState(p->state))
          LitEnc_EncodeMatched(&p->rc, probs, curByte, *(data - p->reps[0] - 1));
        else
          LitEnc_Encode(&p->rc, probs, curByte);
        p->state = kLiteralNextStates[p->state];
      }
      else if (back < LZMA_NUM_REPS)
      {
        /* Rep match (or short rep when len == 1). */
        RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 1);
        RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 1);
        if (back == 0)
        {
          RangeEnc_EncodeBit(&p->rc, &p->isRepG0[p->state], 0);
          RangeEnc_EncodeBit(&p->rc, &p->isRep0Long[p->state][posState], ((len != 1) ? 1 : 0));
        }
        else
        {
          const uint32_t distance = p->reps[back];
          RangeEnc_EncodeBit(&p->rc, &p->isRepG0[p->state], 1);
          if (back == 1)
            RangeEnc_EncodeBit(&p->rc, &p->isRepG1[p->state], 0);
          else
          {
            RangeEnc_EncodeBit(&p->rc, &p->isRepG1[p->state], 1);
            RangeEnc_EncodeBit(&p->rc, &p->isRepG2[p->state], back - 2);
          }
          /* Move the chosen distance to the front of the rep history. */
          for (uint32_t k = back; k > 0; --k)
            p->reps[k] = p->reps[k - 1];
          p->reps[0] = distance;
        }

        if (len == 1)
          p->state = kShortRepNextStates[p->state];
        else
        {
          LenEnc_Encode2(&p->repLenEnc, &p->rc, len - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices);
          p->state = kRepNextStates[p->state];
        }
      }
      else
      {
        /* Normal match: length, then distance as slot + footer bits. */
        uint32_t posSlot;
        RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 1);
        RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 0);
        p->state = kMatchNextStates[p->state];
        LenEnc_Encode2(&p->lenEnc, &p->rc, len - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices);

        uint32_t pos = back - LZMA_NUM_REPS;
        GetPosSlot(pos, posSlot);
        RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], kNumPosSlotBits, posSlot);

        if (posSlot >= kStartPosModelIndex)
        {
          const uint32_t footerBits = ((posSlot >> 1) - 1);
          const uint32_t base = ((2 | (posSlot & 1)) << footerBits);
          const uint32_t posReduced = pos - base;

          if (posSlot < kEndPosModelIndex)
            RcTree_ReverseEncode(&p->rc, p->posEncoders + base - posSlot - 1, footerBits, posReduced);
          else
          {
            RangeEnc_EncodeDirectBits(&p->rc, posReduced >> kNumAlignBits, footerBits - kNumAlignBits);
            RcTree_ReverseEncode(&p->rc, p->posAlignEncoder, kNumAlignBits, posReduced & kAlignMask);
            p->alignPriceCount++;
          }
        }

        /* Push the new distance onto the rep history (reps[3] drops out). */
        for (uint32_t k = 3; k > 0; --k)
          p->reps[k] = p->reps[k - 1];
        p->reps[0] = pos;
        p->matchPriceCount++;
      }

      p->additionalOffset -= len;
      curPos += len;
      if (p->additionalOffset != 0)
        continue;

      /* Refresh the price tables once enough symbols have been coded
         (non-fast mode only). */
      if (!p->fastMode)
      {
        if (p->matchPriceCount >= (1 << 7))
          FillDistancesPrices(p);
        if (p->alignPriceCount >= kAlignTableSize)
          FillAlignPrices(p);
      }

      if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0)
        break;

      const uint32_t processed = curPos - blockStart;
      if (useLimits)
      {
        if (processed + kNumOpts + 300 >= maxUnpackSize ||
            RangeEnc_GetProcessed(&p->rc) + kNumOpts * 2 >= maxPackSize)
          break;
      }
      else if (processed >= (1 << 15))
      {
        p->nowPos64 += processed;
        return CheckErrors(p);
      }
    }
  }

  p->nowPos64 += curPos - blockStart;
  return Flush(p, curPos);
}
27381
|
|
|
|
|
|
|
|
27382
|
|
|
|
|
|
|
#define kBigHashDicLimit ((uint32_t)1 << 24) |
27383
|
|
|
|
|
|
|
|
27384
|
0
|
|
|
|
|
|
static SRes LzmaEnc_Alloc(CLzmaEnc *p, uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig) |
27385
|
|
|
|
|
|
|
{ |
27386
|
|
|
|
|
|
|
uint32_t beforeSize = kNumOpts; |
27387
|
0
|
0
|
|
|
|
|
if (!RangeEnc_Alloc(&p->rc, alloc)) |
27388
|
|
|
|
|
|
|
return SZ_ERROR_MEM; |
27389
|
|
|
|
|
|
|
|
27390
|
|
|
|
|
|
|
{ |
27391
|
0
|
|
|
|
|
|
unsigned lclp = p->lc + p->lp; |
27392
|
0
|
0
|
|
|
|
|
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
27393
|
|
|
|
|
|
|
{ |
27394
|
|
|
|
|
|
|
LzmaEnc_FreeLits(p, alloc); |
27395
|
0
|
|
|
|
|
|
p->litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb)); |
27396
|
0
|
|
|
|
|
|
p->saveState.litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb)); |
27397
|
0
|
0
|
|
|
|
|
if (p->litProbs == 0 || p->saveState.litProbs == 0) |
|
|
0
|
|
|
|
|
|
27398
|
|
|
|
|
|
|
{ |
27399
|
|
|
|
|
|
|
LzmaEnc_FreeLits(p, alloc); |
27400
|
0
|
|
|
|
|
|
return SZ_ERROR_MEM; |
27401
|
|
|
|
|
|
|
} |
27402
|
0
|
|
|
|
|
|
p->lclp = lclp; |
27403
|
|
|
|
|
|
|
} |
27404
|
|
|
|
|
|
|
} |
27405
|
|
|
|
|
|
|
|
27406
|
0
|
|
|
|
|
|
p->matchFinderBase.bigHash = (p->dictSize > kBigHashDicLimit); |
27407
|
|
|
|
|
|
|
|
27408
|
0
|
0
|
|
|
|
|
if (beforeSize + p->dictSize < keepWindowSize) |
27409
|
0
|
|
|
|
|
|
beforeSize = keepWindowSize - p->dictSize; |
27410
|
|
|
|
|
|
|
|
27411
|
|
|
|
|
|
|
{ |
27412
|
0
|
0
|
|
|
|
|
if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) |
27413
|
|
|
|
|
|
|
return SZ_ERROR_MEM; |
27414
|
0
|
|
|
|
|
|
p->matchFinderObj = &p->matchFinderBase; |
27415
|
|
|
|
|
|
|
MatchFinder_CreateVTable(&p->matchFinderBase, &p->matchFinder); |
27416
|
|
|
|
|
|
|
} |
27417
|
|
|
|
|
|
|
return SZ_OK; |
27418
|
|
|
|
|
|
|
} |
27419
|
|
|
|
|
|
|
|
27420
|
0
|
|
|
|
|
|
// Resets the encoder's coding state so that a fresh stream can be produced:
// state index and repeat distances are zeroed, the range encoder and both
// length encoders are re-initialised, every probability table is set to
// kProbInitValue, the optimum/offset bookkeeping is cleared, and the
// position masks are recomputed from the current pb / lp settings.
//
// p->litProbs must already be allocated for the current lc + lp.
void LzmaEnc_Init(CLzmaEnc *p)
{
  p->state = 0;
  for (uint32_t rep = 0; rep < LZMA_NUM_REPS; rep++)
    p->reps[rep] = 0;

  RangeEnc_Init(&p->rc);

  // Per-state match / repeat decision probabilities.
  for (uint32_t state = 0; state < kNumStates; state++)
  {
    for (uint32_t posState = 0; posState < LZMA_NUM_PB_STATES_MAX; posState++)
    {
      p->isMatch[state][posState] = kProbInitValue;
      p->isRep0Long[state][posState] = kProbInitValue;
    }
    p->isRep[state] = kProbInitValue;
    p->isRepG0[state] = kProbInitValue;
    p->isRepG1[state] = kProbInitValue;
    p->isRepG2[state] = kProbInitValue;
  }

  // Literal coder: 0x300 probabilities for each of the 2^(lp + lc) contexts.
  const uint32_t numLitProbs = 0x300 << (p->lp + p->lc);
  for (uint32_t lit = 0; lit < numLitProbs; lit++)
    p->litProbs[lit] = kProbInitValue;

  // Position slot trees, one per length-to-position state.
  for (uint32_t lenState = 0; lenState < kNumLenToPosStates; lenState++)
  {
    CLzmaProb *slotProbs = p->posSlotEncoder[lenState];
    for (uint32_t slot = 0; slot < (1 << kNumPosSlotBits); slot++)
      slotProbs[slot] = kProbInitValue;
  }

  for (uint32_t pos = 0; pos < kNumFullDistances - kEndPosModelIndex; pos++)
    p->posEncoders[pos] = kProbInitValue;

  LenEnc_Init(&p->lenEnc.p);
  LenEnc_Init(&p->repLenEnc.p);

  for (uint32_t align = 0; align < (1 << kNumAlignBits); align++)
    p->posAlignEncoder[align] = kProbInitValue;

  p->optimumEndIndex = 0;
  p->optimumCurrentIndex = 0;
  p->additionalOffset = 0;

  p->pbMask = (1 << p->pb) - 1;
  p->lpMask = (1 << p->lp) - 1;
}
27476
|
|
|
|
|
|
|
|
27477
|
0
|
|
|
|
|
|
// Rebuilds the price tables used when choosing matches.
// The distance and align price tables are only filled when the encoder is not
// in fast mode; the two length price encoders are always refreshed, with a
// table size of numFastBytes + 1 - LZMA_MATCH_LEN_MIN and 2^pb position states.
void LzmaEnc_InitPrices(CLzmaEnc *p)
{
  if (!p->fastMode)
  {
    FillDistancesPrices(p);
    FillAlignPrices(p);
  }

  // Both length encoders share the same table size.
  p->repLenEnc.tableSize = p->numFastBytes + 1 - LZMA_MATCH_LEN_MIN;
  p->lenEnc.tableSize = p->repLenEnc.tableSize;

  const auto numPosStates = 1 << p->pb;
  LenPriceEnc_UpdateTables(&p->lenEnc, numPosStates, p->ProbPrices);
  LenPriceEnc_UpdateTables(&p->repLenEnc, numPosStates, p->ProbPrices);
}
27491
|
|
|
|
|
|
|
|
27492
|
0
|
|
|
|
|
|
// Prepares the encoder for a new stream: derives distTableSize from the
// dictionary size, clears the finished/result flags, allocates all buffers
// (LzmaEnc_Alloc), initialises coder state and price tables, and resets the
// 64-bit position counter.
//
// Returns SZ_OK, or the error reported by LzmaEnc_Alloc.
static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  // Smallest exponent with dictSize <= 2^exponent, capped at
  // kDicLogSizeMaxCompress; the distance table has two entries per exponent.
  uint32_t dictLog = 0;
  while (dictLog < (uint32_t)kDicLogSizeMaxCompress && p->dictSize > ((uint32_t)1 << dictLog))
    dictLog++;
  p->distTableSize = dictLog * 2;

  p->finished = false;
  p->result = SZ_OK;
  RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig));
  LzmaEnc_Init(p);
  LzmaEnc_InitPrices(p);
  p->nowPos64 = 0;
  return SZ_OK;
}
27508
|
|
|
|
|
|
|
|
27509
|
|
|
|
|
|
|
// Wires the given input and output streams into the encoder, flags it as
// needing initialisation, and allocates/initialises it with no extra window
// to keep (keepWindowSize == 0).
//
// Returns the result of LzmaEnc_AllocAndInit.
static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream,
    ISzAlloc *alloc, ISzAlloc *allocBig)
{
  CLzmaEnc *enc = (CLzmaEnc *)pp;

  enc->matchFinderBase.stream = inStream;
  enc->needInit = 1;
  enc->rc.outStream = outStream;

  return LzmaEnc_AllocAndInit(enc, 0, alloc, allocBig);
}
27518
|
|
|
|
|
|
|
|
27519
|
0
|
|
|
|
|
|
// Stream-input preparation variant that lets the caller choose
// keepWindowSize: attaches inStream to the match finder, flags the encoder
// as needing initialisation, then allocates and initialises it.
// Unlike LzmaEnc_Prepare, no output stream is installed here.
//
// Returns the result of LzmaEnc_AllocAndInit.
SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp,
    ISeqInStream *inStream, uint32_t keepWindowSize,
    ISzAlloc *alloc, ISzAlloc *allocBig)
{
  CLzmaEnc *enc = (CLzmaEnc *)pp;

  enc->matchFinderBase.stream = inStream;
  enc->needInit = 1;

  return LzmaEnc_AllocAndInit(enc, keepWindowSize, alloc, allocBig);
}
27528
|
|
|
|
|
|
|
|
27529
|
|
|
|
|
|
|
// Points the match finder directly at an in-memory input buffer
// (directInput mode) of srcLen bytes starting at src.
// The const qualifier is cast away only to fit the bufferBase field.
static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const uint8_t *src, size_t srcLen)
{
  p->matchFinderBase.directInput = 1;
  p->matchFinderBase.bufferBase = const_cast<uint8_t *>(src);
  p->matchFinderBase.directInputRem = srcLen;
}
27535
|
|
|
|
|
|
|
|
27536
|
0
|
|
|
|
|
|
// Memory-input preparation: installs [src, src + srcLen) as the encoder's
// direct input buffer, flags the encoder as needing initialisation, then
// allocates and initialises it with the requested keepWindowSize.
//
// Returns the result of LzmaEnc_AllocAndInit.
SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const uint8_t *src, size_t srcLen,
    uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  CLzmaEnc *enc = (CLzmaEnc *)pp;

  LzmaEnc_SetInputBuf(enc, src, srcLen);
  enc->needInit = 1;

  return LzmaEnc_AllocAndInit(enc, keepWindowSize, alloc, allocBig);
}
27545
|
|
|
|
|
|
|
|
27546
|
0
|
|
|
|
|
|
// End-of-encoding hook. It has nothing to do in this build, so the handle
// argument is intentionally unnamed and the body is empty.
void LzmaEnc_Finish(CLzmaEncHandle /*pp*/) {}
27549
|
|
|
|
|
|
|
|
27550
|
|
|
|
|
|
|
struct CSeqOutStreamBuf |
27551
|
|
|
|
|
|
|
{ |
27552
|
|
|
|
|
|
|
ISeqOutStream funcTable; |
27553
|
|
|
|
|
|
|
uint8_t *data; |
27554
|
|
|
|
|
|
|
size_t rem; |
27555
|
|
|
|
|
|
|
bool overflow; |
27556
|
|
|
|
|
|
|
}; |
27557
|
|
|
|
|
|
|
|
27558
|
0
|
|
|
|
|
|
static size_t MyWrite(void *pp, const void *data, size_t size) |
27559
|
|
|
|
|
|
|
{ |
27560
|
|
|
|
|
|
|
CSeqOutStreamBuf *p = (CSeqOutStreamBuf *)pp; |
27561
|
0
|
0
|
|
|
|
|
if (p->rem < size) |
27562
|
|
|
|
|
|
|
{ |
27563
|
|
|
|
|
|
|
size = p->rem; |
27564
|
0
|
|
|
|
|
|
p->overflow = true; |
27565
|
|
|
|
|
|
|
} |
27566
|
0
|
|
|
|
|
|
memcpy(p->data, data, size); |
27567
|
0
|
|
|
|
|
|
p->rem -= size; |
27568
|
0
|
|
|
|
|
|
p->data += size; |
27569
|
0
|
|
|
|
|
|
return size; |
27570
|
|
|
|
|
|
|
} |
27571
|
|
|
|
|
|
|
|
27572
|
0
|
|
|
|
|
|
// Returns the match finder's count of available input bytes, as reported by
// its GetNumAvailableBytes callback.
uint32_t LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp)
{
  const CLzmaEnc *enc = (CLzmaEnc *)pp;
  const uint32_t available = enc->matchFinder.GetNumAvailableBytes(enc->matchFinderObj);
  return available;
}
27577
|
|
|
|
|
|
|
|
27578
|
0
|
|
|
|
|
|
// Returns the match finder's current-position pointer moved back by
// additionalOffset bytes.
const uint8_t *LzmaEnc_GetCurBuf(CLzmaEncHandle pp)
{
  const CLzmaEnc *enc = (CLzmaEnc *)pp;
  const uint8_t *finderPos = enc->matchFinder.GetPointerToCurrentPos(enc->matchFinderObj);
  return finderPos - enc->additionalOffset;
}
27583
|
|
|
|
|
|
|
|
27584
|
0
|
|
|
|
|
|
// Encodes a single block into the memory buffer [dest, dest + *destLen).
//
//   reInit          - when true, the coder state is reset (LzmaEnc_Init)
//                     before encoding; price tables are rebuilt either way
//   destLen         - in: capacity of dest; out: number of bytes produced
//   desiredPackSize - packed-size limit forwarded to LzmaEnc_CodeOneBlock
//   unpackSize      - in: unpacked-size limit forwarded to
//                     LzmaEnc_CodeOneBlock; out: number of input bytes
//                     consumed by this call
//
// Returns SZ_ERROR_OUTPUT_EOF if dest was too small, otherwise the result of
// LzmaEnc_CodeOneBlock. The output sizes are updated in both cases.
SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, bool reInit,
    uint8_t *dest, size_t *destLen, uint32_t desiredPackSize, uint32_t *unpackSize)
{
  CLzmaEnc *enc = (CLzmaEnc *)pp;

  // Memory-backed output stream covering the caller's buffer.
  CSeqOutStreamBuf sink;
  sink.funcTable.Write = MyWrite;
  sink.data = dest;
  sink.rem = *destLen;
  sink.overflow = false;

  enc->writeEndMark = false;
  enc->finished = false;
  enc->result = SZ_OK;

  if (reInit)
    LzmaEnc_Init(enc);
  LzmaEnc_InitPrices(enc);

  const uint64_t startPos64 = enc->nowPos64;
  RangeEnc_Init(&enc->rc);
  enc->rc.outStream = &sink.funcTable;

  const SRes res = LzmaEnc_CodeOneBlock(enc, true, desiredPackSize, *unpackSize);

  *unpackSize = (uint32_t)(enc->nowPos64 - startPos64);
  *destLen -= sink.rem;

  // A truncated write takes precedence over the block coder's own result.
  return sink.overflow ? SZ_ERROR_OUTPUT_EOF : res;
}
27617
|
|
|
|
|
|
|
|
27618
|
0
|
|
|
|
|
|
// Main encoding loop: codes one block at a time until the encoder reports
// completion or an error. After every unfinished block the optional progress
// callback is invoked with the unpacked and packed positions; any non-OK
// answer from it aborts encoding with SZ_ERROR_PROGRESS.
//
// LzmaEnc_Finish is always called before returning.
static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress)
{
  SRes status = SZ_OK;

  while (true)
  {
    status = LzmaEnc_CodeOneBlock(p, false, 0, 0);
    if (status != SZ_OK || p->finished != 0)
      break;

    if (progress == 0)
      continue;

    if (progress->Progress(progress, p->nowPos64, RangeEnc_GetProcessed(&p->rc)) != SZ_OK)
    {
      status = SZ_ERROR_PROGRESS;
      break;
    }
  }

  LzmaEnc_Finish(p);
  return status;
}
27640
|
|
|
|
|
|
|
|
27641
|
0
|
|
|
|
|
|
// Stream-to-stream encoding entry point: prepares the encoder for the given
// input/output streams and then runs the block loop.
// Returns the preparation error, if any, otherwise the result of the loop.
SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress,
    ISzAlloc *alloc, ISzAlloc *allocBig)
{
  RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig));

  CLzmaEnc *enc = (CLzmaEnc *)pp;
  return LzmaEnc_Encode2(enc, progress);
}
27647
|
|
|
|
|
|
|
|
27648
|
0
|
|
|
|
|
|
// Serialises the encoder properties into `props`.
//
//   props - receives LZMA_PROPS_SIZE bytes: byte 0 packs lc/lp/pb as
//           (pb * 5 + lp) * 9 + lc, bytes 1..4 hold the dictionary size in
//           little-endian order
//   size  - in: capacity of props; out: LZMA_PROPS_SIZE
//
// The stored dictionary size is the encoder's dictSize rounded up to the
// next value of the form 2 << k or 3 << k (k = 11..30); a size above 3 << 30
// is stored unchanged.
//
// Returns SZ_ERROR_PARAM if *size is smaller than LZMA_PROPS_SIZE (nothing is
// written in that case), SZ_OK otherwise.
SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, uint8_t *props, size_t *size)
{
  CLzmaEnc *p = (CLzmaEnc *)pp;
  uint32_t dictSize = p->dictSize;

  if (*size < LZMA_PROPS_SIZE)
    return SZ_ERROR_PARAM;
  *size = LZMA_PROPS_SIZE;
  props[0] = (uint8_t)((p->pb * 5 + p->lp) * 9 + p->lc);

  // All shifts are done in uint32_t: with a plain int operand, 2 << 30 and
  // 3 << 30 would not fit into a signed 32-bit int.
  for (unsigned shift = 11; shift <= 30; shift++)
  {
    const uint32_t twoStep = (uint32_t)2 << shift;
    if (dictSize <= twoStep)
    {
      dictSize = twoStep;
      break;
    }
    const uint32_t threeStep = (uint32_t)3 << shift;
    if (dictSize <= threeStep)
    {
      dictSize = threeStep;
      break;
    }
  }

  for (unsigned byteIndex = 0; byteIndex < 4; byteIndex++)
    props[1 + byteIndex] = (uint8_t)(dictSize >> (8 * byteIndex));
  return SZ_OK;
}
27676
|
|
|
|
|
|
|
|
27677
|
0
|
|
|
|
|
|
// Memory-to-memory encoding with an already configured encoder.
//
//   dest, destLen - output buffer; *destLen is its capacity on entry and the
//                   number of bytes produced on return
//   src, srcLen   - input data
//   writeEndMark  - stored into the encoder's writeEndMark setting
//   progress      - optional progress callback (may be 0)
//
// Returns SZ_ERROR_OUTPUT_EOF if dest was too small, otherwise the result of
// preparation / encoding. *destLen is updated in every case.
SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen,
    int writeEndMark, ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  CLzmaEnc *enc = (CLzmaEnc *)pp;

  // LzmaEnc_MemPrepare below installs the same input buffer again.
  LzmaEnc_SetInputBuf(enc, src, srcLen);

  // Memory-backed output stream covering the caller's buffer.
  CSeqOutStreamBuf sink;
  sink.funcTable.Write = MyWrite;
  sink.data = dest;
  sink.rem = *destLen;
  sink.overflow = false;

  enc->writeEndMark = writeEndMark;
  enc->rc.outStream = &sink.funcTable;

  SRes res = LzmaEnc_MemPrepare(pp, src, srcLen, 0, alloc, allocBig);
  if (res == SZ_OK)
    res = LzmaEnc_Encode2(enc, progress);

  *destLen -= sink.rem;

  // A truncated write takes precedence over the encoder's own result.
  return sink.overflow ? SZ_ERROR_OUTPUT_EOF : res;
}
27704
|
|
|
|
|
|
|
|
27705
|
0
|
|
|
|
|
|
// One-shot memory-to-memory compression.
// Creates a temporary encoder, applies `props`, writes the encoded properties
// to propsEncoded / propsSize, compresses src into dest, and destroys the
// encoder again. Each step runs only if the previous one returned SZ_OK.
//
// Returns SZ_ERROR_MEM if the encoder cannot be created, otherwise the result
// of the first failing step, or SZ_OK.
SRes LzmaEncode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen,
    const CLzmaEncProps *props, uint8_t *propsEncoded, size_t *propsSize, int writeEndMark,
    ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig)
{
  CLzmaEnc *enc = (CLzmaEnc *)LzmaEnc_Create(alloc);
  if (enc == 0)
    return SZ_ERROR_MEM;

  SRes status = LzmaEnc_SetProps(enc, props);
  if (status == SZ_OK)
    status = LzmaEnc_WriteProperties(enc, propsEncoded, propsSize);
  if (status == SZ_OK)
    status = LzmaEnc_MemEncode(enc, dest, destLen, src, srcLen,
        writeEndMark, progress, alloc, allocBig);

  // The encoder is released on every path once it has been created.
  LzmaEnc_Destroy(enc, alloc, allocBig);
  return status;
}
27726
|
|
|
|
|
|
|
|
27727
|
|
|
|
|
|
|
} // namespace lzma |
27728
|
|
|
|
|
|
|
// End of LZMA compression library by Igor Pavlov |
27729
|
|
|
|
|
|
|
|
27730
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
27731
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
27732
|
|
|
|
|
|
|
static void *LzmaAlloc(void* /*p*/, size_t size) { return new char[size]; } |
27733
|
|
|
|
|
|
|
static void LzmaFree(void* /*p*/, void *address) { delete[] (char*) address; } |
27734
|
|
|
|
|
|
|
static lzma::ISzAlloc lzmaAllocator = { LzmaAlloc, LzmaFree }; |
27735
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
27736
|
|
|
|
|
|
|
|
27737
|
0
|
|
|
|
|
|
// Compresses enc.data with LZMA and writes it to `os` as:
//   uint32 uncompressed size, uint32 compressed size, uint32 checksum of the
//   two sizes, LZMA_PROPS_SIZE bytes of encoded LZMA properties, and finally
//   the compressed bytes.
// The three 32-bit header fields are written in host byte order.
//
// Returns false if compression fails, if either size does not fit into
// 32 bits, or if any write to `os` fails; true otherwise.
bool compressor::save(ostream& os, const binary_encoder& enc) {
  size_t uncompressed_size = enc.data.size(), compressed_size = 2 * enc.data.size() + 100;
  vector<unsigned char> compressed(compressed_size);

  lzma::CLzmaEncProps props;
  lzma::LzmaEncProps_Init(&props);
  unsigned char props_encoded[LZMA_PROPS_SIZE];
  size_t props_encoded_size = LZMA_PROPS_SIZE;

  // On success, compressed_size is updated to the number of bytes produced.
  auto res = lzma::LzmaEncode(compressed.data(), &compressed_size, enc.data.data(), uncompressed_size, &props, props_encoded, &props_encoded_size, 0, nullptr, &lzmaAllocator, &lzmaAllocator);
  if (res != SZ_OK) return false;

  // The header stores both sizes as 32-bit values; refuse anything larger
  // before computing any header field.
  if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false;

  // Write from real uint32_t objects. Writing the first sizeof(uint32_t)
  // bytes of a size_t would emit the high (zero) bytes on a big-endian host
  // with a 64-bit size_t.
  const uint32_t uncompressed_size32 = uint32_t(uncompressed_size);
  const uint32_t compressed_size32 = uint32_t(compressed_size);
  const uint32_t poor_crc = uint32_t(uncompressed_size * 19991 + compressed_size * 199999991 + 1234567890);

  if (!os.write((const char*) &uncompressed_size32, sizeof(uint32_t))) return false;
  if (!os.write((const char*) &compressed_size32, sizeof(uint32_t))) return false;
  if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false;
  if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false;
  if (!os.write((const char*) compressed.data(), compressed_size)) return false;

  return true;
}
27759
|
|
|
|
|
|
|
|
27760
|
|
|
|
|
|
|
} // namespace utils |
27761
|
|
|
|
|
|
|
|
27762
|
|
|
|
|
|
|
///////// |
27763
|
|
|
|
|
|
|
// File: version/version.cpp |
27764
|
|
|
|
|
|
|
///////// |
27765
|
|
|
|
|
|
|
|
27766
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
27767
|
|
|
|
|
|
|
// |
27768
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
27769
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
27770
|
|
|
|
|
|
|
// |
27771
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
27772
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
27773
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
27774
|
|
|
|
|
|
|
|
27775
|
|
|
|
|
|
|
// Returns current version. |
27776
|
0
|
|
|
|
|
|
version version::current() { |
27777
|
0
|
0
|
|
|
|
|
return {1, 3, 0, ""}; |
27778
|
|
|
|
|
|
|
} |
27779
|
|
|
|
|
|
|
|
27780
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
27781
|
0
|
|
|
|
|
|
string version::version_and_copyright(const string& other_libraries) { |
27782
|
0
|
|
|
|
|
|
ostringstream info; |
27783
|
|
|
|
|
|
|
|
27784
|
|
|
|
|
|
|
auto udpipe = version::current(); |
27785
|
|
|
|
|
|
|
auto unilib = unilib::version::current(); |
27786
|
|
|
|
|
|
|
auto morphodita = morphodita::version::current(); |
27787
|
|
|
|
|
|
|
auto parsito = parsito::version::current(); |
27788
|
|
|
|
|
|
|
|
27789
|
0
|
|
|
|
|
|
info << "UDPipe version " << udpipe.major << '.' << udpipe.minor << '.' << udpipe.patch |
27790
|
0
|
0
|
|
|
|
|
<< (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease |
|
|
0
|
|
|
|
|
|
27791
|
0
|
|
|
|
|
|
<< " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch |
27792
|
0
|
0
|
|
|
|
|
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
|
|
0
|
|
|
|
|
|
27793
|
0
|
|
|
|
|
|
<< ",\nMorphoDiTa " << morphodita.major << '.' << morphodita.minor << '.' << unilib.patch |
27794
|
0
|
0
|
|
|
|
|
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
|
0
|
|
|
|
|
|
27795
|
0
|
|
|
|
|
|
<< ", Parsito " << parsito.major << '.' << parsito.minor << '.' << unilib.patch |
27796
|
0
|
0
|
|
|
|
|
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
|
0
|
|
|
|
|
|
27797
|
0
|
0
|
|
|
|
|
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
0
|
|
|
|
|
|
27798
|
|
|
|
|
|
|
"Copyright 2016 by Institute of Formal and Applied Linguistics, Faculty of\n" |
27799
|
0
|
0
|
|
|
|
|
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
27800
|
|
|
|
|
|
|
|
27801
|
0
|
|
|
|
|
|
return info.str(); |
27802
|
|
|
|
|
|
|
} |
27803
|
|
|
|
|
|
|
|
27804
|
|
|
|
|
|
|
} // namespace udpipe |
27805
|
8
|
50
|
|
|
|
|
} // namespace ufal |
|
|
50
|
|
|
|
|
|