| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
2
|
|
|
|
|
|
|
// |
|
3
|
|
|
|
|
|
|
// This file is a bundle of all sources and headers of UDPipe library. |
|
4
|
|
|
|
|
|
|
// Comments and copyrights of all individual files are kept. |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
#include |
|
7
|
|
|
|
|
|
|
#include |
|
8
|
|
|
|
|
|
|
#include |
|
9
|
|
|
|
|
|
|
#include |
|
10
|
|
|
|
|
|
|
#include |
|
11
|
|
|
|
|
|
|
#include |
|
12
|
|
|
|
|
|
|
#include |
|
13
|
|
|
|
|
|
|
#include |
|
14
|
|
|
|
|
|
|
#include |
|
15
|
|
|
|
|
|
|
#include |
|
16
|
|
|
|
|
|
|
#include |
|
17
|
|
|
|
|
|
|
#include |
|
18
|
|
|
|
|
|
|
#include |
|
19
|
|
|
|
|
|
|
#include |
|
20
|
|
|
|
|
|
|
#include |
|
21
|
|
|
|
|
|
|
#include |
|
22
|
|
|
|
|
|
|
#include |
|
23
|
|
|
|
|
|
|
#include |
|
24
|
|
|
|
|
|
|
#include |
|
25
|
|
|
|
|
|
|
#include |
|
26
|
|
|
|
|
|
|
#include |
|
27
|
|
|
|
|
|
|
#include |
|
28
|
|
|
|
|
|
|
#include |
|
29
|
|
|
|
|
|
|
#include |
|
30
|
|
|
|
|
|
|
#include |
|
31
|
|
|
|
|
|
|
#include |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
namespace ufal { |
|
34
|
|
|
|
|
|
|
namespace udpipe { |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
///////// |
|
37
|
|
|
|
|
|
|
// File: utils/common.h |
|
38
|
|
|
|
|
|
|
///////// |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>. |
|
41
|
|
|
|
|
|
|
// |
|
42
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
43
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
44
|
|
|
|
|
|
|
// |
|
45
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
46
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
47
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
48
|
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
// Headers available in all sources |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
namespace utils {

// Every source in the bundle refers to standard-library names unqualified.
using namespace std;

// Compile-time guarantee: `int` is no narrower than a 32-bit integer.
static_assert(sizeof(int32_t) <= sizeof(int), "Int must be at least 4B wide!");

// Compile-time guarantee: the target is little endian. The check is only
// performed when the compiler provides the __BYTE_ORDER__ macro.
#if defined(__BYTE_ORDER__)
static_assert(__ORDER_LITTLE_ENDIAN__ == __BYTE_ORDER__, "Only little endian systems are supported!");
#endif

// Writes `message` to cerr (followed by endl) and terminates the process with
// exit status 1. `message` is pasted directly after `cerr <<`, so a chain such
// as `"text " << value` may be passed as the argument.
#define runtime_failure(message) exit((cerr << message << endl, 1))

} // namespace utils
|
66
|
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
///////// |
|
68
|
|
|
|
|
|
|
// File: utils/string_piece.h |
|
69
|
|
|
|
|
|
|
///////// |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>. |
|
72
|
|
|
|
|
|
|
// |
|
73
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
74
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
75
|
|
|
|
|
|
|
// |
|
76
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
77
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
78
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
namespace utils {

// Non-owning (pointer, length) view of a byte sequence. The viewed data need
// not be NUL-terminated, and a default-constructed piece is {nullptr, 0}.
// The piece never copies or frees the data it points at.
struct string_piece {
  const char* str;  // first viewed byte; nullptr for a default-constructed piece
  size_t len;       // number of viewed bytes

  // Empty piece.
  string_piece() : str(nullptr), len(0) {}
  // View of a NUL-terminated C string; a null pointer yields an empty piece
  // instead of calling strlen(nullptr), which is undefined behaviour.
  string_piece(const char* str) : str(str), len(str ? strlen(str) : 0) {}
  // View of exactly `len` bytes starting at `str`.
  string_piece(const char* str, size_t len) : str(str), len(len) {}
  // View of the whole contents of `str`.
  string_piece(const string& str) : str(str.c_str()), len(str.size()) {}
};

// Writes exactly `str.len` bytes of the piece to `os` and returns `os`.
inline ostream& operator<<(ostream& os, const string_piece& str) {
  return os.write(str.str, static_cast<streamsize>(str.len));
}

// Byte-wise equality. Two empty pieces compare equal without touching their
// pointers: passing a null pointer to memcmp is undefined even for length 0,
// and default-constructed pieces hold nullptr.
inline bool operator==(const string_piece& a, const string_piece& b) {
  if (a.len != b.len) return false;
  return a.len == 0 || memcmp(a.str, b.str, a.len) == 0;
}

// Negation of operator==, so both operators share the empty-piece handling.
inline bool operator!=(const string_piece& a, const string_piece& b) {
  return !(a == b);
}

} // namespace utils
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
///////// |
|
107
|
|
|
|
|
|
|
// File: common.h |
|
108
|
|
|
|
|
|
|
///////// |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
111
|
|
|
|
|
|
|
// |
|
112
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
113
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
114
|
|
|
|
|
|
|
// |
|
115
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
116
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
117
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
118
|
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
using namespace utils; |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
///////// |
|
122
|
|
|
|
|
|
|
// File: sentence/empty_node.h |
|
123
|
|
|
|
|
|
|
///////// |
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
126
|
|
|
|
|
|
|
// |
|
127
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
128
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
129
|
|
|
|
|
|
|
// |
|
130
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
131
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
132
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
133
|
|
|
|
|
|
|
|
|
134
|
0
|
|
|
|
|
|
// Plain record describing one empty node of a sentence.
// NOTE(review): the field set looks like a CoNLL-U empty-node line
// (an `id.index` identifier) -- confirm against the readers/writers.
class empty_node {
 public:
  // Identification.
  int id;         // 0 is root, >0 is sentence word, <0 is undefined
  int index;      // index for the current id, should be numbered from 1, 0=undefined

  // Annotation columns; all start out empty.
  string form;    // form
  string lemma;   // lemma
  string upostag; // universal part-of-speech tag
  string xpostag; // language-specific part-of-speech tag
  string feats;   // list of morphological features
  string deps;    // secondary dependencies
  string misc;    // miscellaneous information

  // By default the node is undefined (id -1, index 0).
  empty_node(int id = -1, int index = 0)
      : id(id), index(index) {}
};
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
///////// |
|
150
|
|
|
|
|
|
|
// File: sentence/token.h |
|
151
|
|
|
|
|
|
|
///////// |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
154
|
|
|
|
|
|
|
// |
|
155
|
|
|
|
|
|
|
// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of |
|
156
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
157
|
|
|
|
|
|
|
// |
|
158
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
159
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
160
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
161
|
|
|
|
|
|
|
|
|
162
|
24
|
|
|
|
|
|
class token { |
|
163
|
|
|
|
|
|
|
public: |
|
164
|
|
|
|
|
|
|
string form; |
|
165
|
|
|
|
|
|
|
string misc; |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
token(string_piece form = string_piece(), string_piece misc = string_piece()); |
|
168
|
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
// CoNLL-U defined SpaceAfter=No feature |
|
170
|
|
|
|
|
|
|
bool get_space_after() const; |
|
171
|
|
|
|
|
|
|
void set_space_after(bool space_after); |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
// UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features |
|
174
|
|
|
|
|
|
|
void get_spaces_before(string& spaces_before) const; |
|
175
|
|
|
|
|
|
|
void set_spaces_before(string_piece spaces_before); |
|
176
|
|
|
|
|
|
|
void get_spaces_after(string& spaces_after) const; |
|
177
|
|
|
|
|
|
|
void set_spaces_after(string_piece spaces_after); |
|
178
|
|
|
|
|
|
|
void get_spaces_in_token(string& spaces_in_token) const; |
|
179
|
|
|
|
|
|
|
void set_spaces_in_token(string_piece spaces_in_token); |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
// UDPipe-specific TokenRange feature |
|
182
|
|
|
|
|
|
|
bool get_token_range(size_t& start, size_t& end) const; |
|
183
|
|
|
|
|
|
|
void set_token_range(size_t start, size_t end); |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
private: |
|
186
|
|
|
|
|
|
|
bool get_misc_field(string_piece name, string_piece& value) const; |
|
187
|
|
|
|
|
|
|
void remove_misc_field(string_piece name); |
|
188
|
|
|
|
|
|
|
string& start_misc_field(string_piece name); |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
void append_escaped_spaces(string_piece spaces, string& escaped_spaces) const; |
|
191
|
|
|
|
|
|
|
void unescape_spaces(string_piece escaped_spaces, string& spaces) const; |
|
192
|
|
|
|
|
|
|
}; |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
///////// |
|
195
|
|
|
|
|
|
|
// File: sentence/multiword_token.h |
|
196
|
|
|
|
|
|
|
///////// |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
199
|
|
|
|
|
|
|
// |
|
200
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
201
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
202
|
|
|
|
|
|
|
// |
|
203
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
204
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
205
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
206
|
|
|
|
|
|
|
|
|
207
|
0
|
0
|
|
|
|
|
class multiword_token : public token { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
public: |
|
209
|
|
|
|
|
|
|
// form and misc are inherited from token |
|
210
|
|
|
|
|
|
|
int id_first, id_last; |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
multiword_token(int id_first = -1, int id_last = -1, string_piece form = string_piece(), string_piece misc = string_piece()) |
|
213
|
0
|
|
|
|
|
|
: token(form, misc), id_first(id_first), id_last(id_last) {} |
|
214
|
|
|
|
|
|
|
}; |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
///////// |
|
217
|
|
|
|
|
|
|
// File: sentence/word.h |
|
218
|
|
|
|
|
|
|
///////// |
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
221
|
|
|
|
|
|
|
// |
|
222
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
223
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
224
|
|
|
|
|
|
|
// |
|
225
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
226
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
227
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
228
|
|
|
|
|
|
|
|
|
229
|
46
|
0
|
|
|
|
|
class word : public token { |
|
230
|
|
|
|
|
|
|
public: |
|
231
|
|
|
|
|
|
|
// form and misc are inherited from token |
|
232
|
|
|
|
|
|
|
int id; // 0 is root, >0 is sentence word, <0 is undefined |
|
233
|
|
|
|
|
|
|
string lemma; // lemma |
|
234
|
|
|
|
|
|
|
string upostag; // universal part-of-speech tag |
|
235
|
|
|
|
|
|
|
string xpostag; // language-specific part-of-speech tag |
|
236
|
|
|
|
|
|
|
string feats; // list of morphological features |
|
237
|
|
|
|
|
|
|
int head; // head, 0 is root, <0 is undefined |
|
238
|
|
|
|
|
|
|
string deprel; // dependency relation to the head |
|
239
|
|
|
|
|
|
|
string deps; // secondary dependencies |
|
240
|
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
vector children; |
|
242
|
|
|
|
|
|
|
|
|
243
|
20
|
|
|
|
|
|
word(int id = -1, string_piece form = string_piece()) : token(form), id(id), head(-1) {} |
|
244
|
|
|
|
|
|
|
}; |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
///////// |
|
247
|
|
|
|
|
|
|
// File: sentence/sentence.h |
|
248
|
|
|
|
|
|
|
///////// |
|
249
|
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
251
|
|
|
|
|
|
|
// |
|
252
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
253
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
254
|
|
|
|
|
|
|
// |
|
255
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
256
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
257
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
258
|
|
|
|
|
|
|
|
|
259
|
0
|
0
|
|
|
|
|
class sentence { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
public: |
|
261
|
|
|
|
|
|
|
sentence(); |
|
262
|
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
vector words; |
|
264
|
|
|
|
|
|
|
vector multiword_tokens; |
|
265
|
|
|
|
|
|
|
vector empty_nodes; |
|
266
|
|
|
|
|
|
|
vector comments; |
|
267
|
|
|
|
|
|
|
static const string root_form; |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
// Basic sentence modifications |
|
270
|
|
|
|
|
|
|
bool empty(); |
|
271
|
|
|
|
|
|
|
void clear(); |
|
272
|
|
|
|
|
|
|
word& add_word(string_piece form = string_piece()); |
|
273
|
|
|
|
|
|
|
void set_head(int id, int head, const string& deprel); |
|
274
|
|
|
|
|
|
|
void unlink_all_words(); |
|
275
|
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
// CoNLL-U defined comments |
|
277
|
|
|
|
|
|
|
bool get_new_doc(string* id = nullptr) const; |
|
278
|
|
|
|
|
|
|
void set_new_doc(bool new_doc, string_piece id = string_piece()); |
|
279
|
|
|
|
|
|
|
bool get_new_par(string* id = nullptr) const; |
|
280
|
|
|
|
|
|
|
void set_new_par(bool new_par, string_piece id = string_piece()); |
|
281
|
|
|
|
|
|
|
bool get_sent_id(string& id) const; |
|
282
|
|
|
|
|
|
|
void set_sent_id(string_piece id); |
|
283
|
|
|
|
|
|
|
bool get_text(string& text) const; |
|
284
|
|
|
|
|
|
|
void set_text(string_piece text); |
|
285
|
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
private: |
|
287
|
|
|
|
|
|
|
bool get_comment(string_piece name, string* value) const; |
|
288
|
|
|
|
|
|
|
void remove_comment(string_piece name); |
|
289
|
|
|
|
|
|
|
void set_comment(string_piece name, string_piece value = string_piece()); |
|
290
|
|
|
|
|
|
|
}; |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
///////// |
|
293
|
|
|
|
|
|
|
// File: sentence/input_format.h |
|
294
|
|
|
|
|
|
|
///////// |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
297
|
|
|
|
|
|
|
// |
|
298
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
299
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
300
|
|
|
|
|
|
|
// |
|
301
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
302
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
303
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
304
|
|
|
|
|
|
|
|
|
305
|
1
|
|
|
|
|
|
class input_format { |
|
306
|
|
|
|
|
|
|
public: |
|
307
|
1
|
|
|
|
|
|
virtual ~input_format() {} |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const = 0; |
|
310
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) = 0; |
|
311
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) = 0; |
|
312
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) = 0; |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
// Static factory methods |
|
315
|
|
|
|
|
|
|
static input_format* new_input_format(const string& name); |
|
316
|
|
|
|
|
|
|
static input_format* new_conllu_input_format(const string& options = string()); |
|
317
|
|
|
|
|
|
|
static input_format* new_generic_tokenizer_input_format(const string& options = string()); |
|
318
|
|
|
|
|
|
|
static input_format* new_horizontal_input_format(const string& options = string()); |
|
319
|
|
|
|
|
|
|
static input_format* new_vertical_input_format(const string& options = string()); |
|
320
|
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
static input_format* new_presegmented_tokenizer(input_format* tokenizer); |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
static const string CONLLU_V1; |
|
324
|
|
|
|
|
|
|
static const string CONLLU_V2; |
|
325
|
|
|
|
|
|
|
static const string GENERIC_TOKENIZER_NORMALIZED_SPACES; |
|
326
|
|
|
|
|
|
|
static const string GENERIC_TOKENIZER_PRESEGMENTED; |
|
327
|
|
|
|
|
|
|
static const string GENERIC_TOKENIZER_RANGES; |
|
328
|
|
|
|
|
|
|
}; |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
///////// |
|
331
|
|
|
|
|
|
|
// File: model/model.h |
|
332
|
|
|
|
|
|
|
///////// |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
335
|
|
|
|
|
|
|
// |
|
336
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
337
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
338
|
|
|
|
|
|
|
// |
|
339
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
340
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
341
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
342
|
|
|
|
|
|
|
|
|
343
|
1
|
|
|
|
|
|
class model { |
|
344
|
|
|
|
|
|
|
public: |
|
345
|
1
|
|
|
|
|
|
virtual ~model() {} |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
static model* load(const char* fname); |
|
348
|
|
|
|
|
|
|
static model* load(istream& is); |
|
349
|
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
virtual input_format* new_tokenizer(const string& options) const = 0; |
|
351
|
|
|
|
|
|
|
virtual bool tag(sentence& s, const string& options, string& error) const = 0; |
|
352
|
|
|
|
|
|
|
virtual bool parse(sentence& s, const string& options, string& error) const = 0; |
|
353
|
|
|
|
|
|
|
|
|
354
|
|
|
|
|
|
|
static const string DEFAULT; |
|
355
|
|
|
|
|
|
|
static const string TOKENIZER_NORMALIZED_SPACES; |
|
356
|
|
|
|
|
|
|
static const string TOKENIZER_PRESEGMENTED; |
|
357
|
|
|
|
|
|
|
static const string TOKENIZER_RANGES; |
|
358
|
|
|
|
|
|
|
}; |
|
359
|
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
///////// |
|
361
|
|
|
|
|
|
|
// File: model/evaluator.h |
|
362
|
|
|
|
|
|
|
///////// |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
365
|
|
|
|
|
|
|
// |
|
366
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
367
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
368
|
|
|
|
|
|
|
// |
|
369
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
370
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
371
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
class evaluator { |
|
374
|
|
|
|
|
|
|
public: |
|
375
|
|
|
|
|
|
|
evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser); |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
void set_model(const model* m); |
|
378
|
|
|
|
|
|
|
void set_tokenizer(const string& tokenizer); |
|
379
|
|
|
|
|
|
|
void set_tagger(const string& tagger); |
|
380
|
|
|
|
|
|
|
void set_parser(const string& parser); |
|
381
|
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
bool evaluate(istream& is, ostream& os, string& error) const; |
|
383
|
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
static const string DEFAULT; |
|
385
|
|
|
|
|
|
|
static const string NONE; |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
private: |
|
388
|
|
|
|
|
|
|
const model* m; |
|
389
|
|
|
|
|
|
|
string tokenizer, tagger, parser; |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
struct f1_info { size_t total_system, total_gold; double precision, recall, f1; }; |
|
392
|
|
|
|
|
|
|
template |
|
393
|
|
|
|
|
|
|
static f1_info evaluate_f1(const vector>& system, const vector>& gold); |
|
394
|
|
|
|
|
|
|
|
|
395
|
0
|
|
|
|
|
|
class evaluation_data { |
|
396
|
|
|
|
|
|
|
public: |
|
397
|
0
|
|
|
|
|
|
struct word_data { |
|
398
|
|
|
|
|
|
|
size_t start, end; |
|
399
|
|
|
|
|
|
|
bool is_multiword; |
|
400
|
|
|
|
|
|
|
word w; |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
word_data(size_t start, size_t end, int id, bool is_multiword, const word& w); |
|
403
|
|
|
|
|
|
|
}; |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
void add_sentence(const sentence& s); |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
u32string chars; |
|
408
|
|
|
|
|
|
|
vector> sentences, tokens; |
|
409
|
|
|
|
|
|
|
vector> multiwords; |
|
410
|
|
|
|
|
|
|
vector words; |
|
411
|
|
|
|
|
|
|
}; |
|
412
|
|
|
|
|
|
|
|
|
413
|
0
|
|
|
|
|
|
class word_alignment { |
|
414
|
|
|
|
|
|
|
public: |
|
415
|
0
|
|
|
|
|
|
struct pair_system_gold { |
|
416
|
|
|
|
|
|
|
word system; const word& gold; |
|
417
|
0
|
0
|
|
|
|
|
pair_system_gold(const word& system, const word& gold) : system(system), gold(gold) {} |
|
418
|
|
|
|
|
|
|
}; |
|
419
|
|
|
|
|
|
|
vector matched; |
|
420
|
|
|
|
|
|
|
size_t total_system, total_gold; |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
template |
|
423
|
|
|
|
|
|
|
f1_info evaluate_f1(Equals equals); |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
static bool perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment); |
|
426
|
|
|
|
|
|
|
static void best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment); |
|
427
|
|
|
|
|
|
|
}; |
|
428
|
|
|
|
|
|
|
}; |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
///////// |
|
431
|
|
|
|
|
|
|
// File: unilib/unicode.h |
|
432
|
|
|
|
|
|
|
///////// |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
// This file is part of UniLib <http://github.com/ufal/unilib/>. |
|
435
|
|
|
|
|
|
|
// |
|
436
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
437
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
438
|
|
|
|
|
|
|
// |
|
439
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
440
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
441
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
442
|
|
|
|
|
|
|
// |
|
443
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
444
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
namespace unilib { |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
// Per-code-point lookups: general category and lower/upper/title case
// mappings. All members are static; the lookup tables are defined elsewhere.
class unicode {
  // Ordinal of each general category as stored in the category tables.
  // The public bit masks below are `1 << ordinal`.
  enum : uint8_t {
    _Lu = 1, _Ll = 2, _Lt = 3, _Lm = 4, _Lo = 5,
    _Mn = 6, _Mc = 7, _Me = 8,
    _Nd = 9, _Nl = 10, _No = 11,
    _Pc = 12, _Pd = 13, _Ps = 14, _Pe = 15, _Pi = 16, _Pf = 17, _Po = 18,
    _Sm = 19, _Sc = 20, _Sk = 21, _So = 22,
    _Zs = 23, _Zl = 24, _Zp = 25,
    _Cc = 26, _Cf = 27, _Cs = 28, _Co = 29, _Cn = 30
  };

 public:
  // Bit mask of general categories: one bit per category, plus combined
  // masks (Lut, LC, L, M, N, P, S, Z, C) for testing several at once.
  typedef uint32_t category_t;
  enum : category_t {
    Lu = 1 << _Lu, Ll = 1 << _Ll, Lt = 1 << _Lt, Lut = Lu | Lt, LC = Lu | Ll | Lt,
    Lm = 1 << _Lm, Lo = 1 << _Lo, L = Lu | Ll | Lt | Lm | Lo,
    Mn = 1 << _Mn, Mc = 1 << _Mc, Me = 1 << _Me, M = Mn | Mc | Me,
    Nd = 1 << _Nd, Nl = 1 << _Nl, No = 1 << _No, N = Nd | Nl | No,
    Pc = 1 << _Pc, Pd = 1 << _Pd, Ps = 1 << _Ps, Pe = 1 << _Pe, Pi = 1 << _Pi,
    Pf = 1 << _Pf, Po = 1 << _Po, P = Pc | Pd | Ps | Pe | Pi | Pf | Po,
    Sm = 1 << _Sm, Sc = 1 << _Sc, Sk = 1 << _Sk, So = 1 << _So, S = Sm | Sc | Sk | So,
    Zs = 1 << _Zs, Zl = 1 << _Zl, Zp = 1 << _Zp, Z = Zs | Zl | Zp,
    Cc = 1 << _Cc, Cf = 1 << _Cf, Cs = 1 << _Cs, Co = 1 << _Co, Cn = 1 << _Cn, C = Cc | Cf | Cs | Co | Cn
  };

  // Category mask of `chr`.
  static inline category_t category(char32_t chr);

  // Case mappings of `chr`.
  static inline char32_t lowercase(char32_t chr);
  static inline char32_t uppercase(char32_t chr);
  static inline char32_t titlecase(char32_t chr);

 private:
  static const char32_t CHARS = 0x110000;  // code points at or above this are not in the tables
  static const int32_t DEFAULT_CAT = Cn;   // category reported for such code points

  // Two-level tables: `*_index` is addressed by `chr >> 8` and selects a
  // 256-entry row of `*_block`, which is addressed by `chr & 0xFF`.
  static const uint8_t category_index[CHARS >> 8];
  static const uint8_t category_block[][256];
  static const uint8_t othercase_index[CHARS >> 8];
  static const char32_t othercase_block[][256];

  // Tag kept in the low byte of an othercase_block entry; the rest of the
  // entry (entry >> 8) is a code point.
  enum othercase_type { LOWER_ONLY = 1, UPPERTITLE_ONLY = 2, UPPER_ONLY = 3, LOWER_THEN_UPPER = 4, UPPER_THEN_TITLE = 5, TITLE_THEN_LOWER = 6 };
};
|
490
|
|
|
|
|
|
|
|
|
491
|
|
|
|
|
|
|
// Returns the category mask of `chr`: the bit selected by the ordinal stored
// in the two-level category table, or DEFAULT_CAT when `chr` is not below CHARS.
unicode::category_t unicode::category(char32_t chr) {
  if (!(chr < CHARS)) return DEFAULT_CAT;

  return 1 << category_block[category_index[chr >> 8]][chr & 0xFF];
}
|
494
|
|
|
|
|
|
|
|
|
495
|
30
|
|
|
|
|
|
char32_t unicode::lowercase(char32_t chr) { |
|
496
|
30
|
50
|
|
|
|
|
if (chr < CHARS) { |
|
497
|
30
|
|
|
|
|
|
char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
|
498
|
30
|
100
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8; |
|
499
|
28
|
50
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8; |
|
500
|
28
|
50
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
|
501
|
|
|
|
|
|
|
} |
|
502
|
|
|
|
|
|
|
return chr; |
|
503
|
|
|
|
|
|
|
} |
|
504
|
|
|
|
|
|
|
|
|
505
|
0
|
|
|
|
|
|
char32_t unicode::uppercase(char32_t chr) { |
|
506
|
0
|
0
|
|
|
|
|
if (chr < CHARS) { |
|
507
|
0
|
|
|
|
|
|
char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
|
508
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
|
509
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8; |
|
510
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8; |
|
511
|
0
|
0
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
|
512
|
|
|
|
|
|
|
} |
|
513
|
|
|
|
|
|
|
return chr; |
|
514
|
|
|
|
|
|
|
} |
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
char32_t unicode::titlecase(char32_t chr) { |
|
517
|
|
|
|
|
|
|
if (chr < CHARS) { |
|
518
|
|
|
|
|
|
|
char32_t othercase = othercase_block[othercase_index[chr >> 8]][chr & 0xFF]; |
|
519
|
|
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
|
520
|
|
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase >> 8; |
|
521
|
|
|
|
|
|
|
if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
|
522
|
|
|
|
|
|
|
} |
|
523
|
|
|
|
|
|
|
return chr; |
|
524
|
|
|
|
|
|
|
} |
|
525
|
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
} // namespace unilib |
|
527
|
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
///////// |
|
529
|
|
|
|
|
|
|
// File: unilib/utf8.h |
|
530
|
|
|
|
|
|
|
///////// |
|
531
|
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
// This file is part of UniLib <http://github.com/ufal/unilib/>. |
|
533
|
|
|
|
|
|
|
// |
|
534
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
535
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
536
|
|
|
|
|
|
|
// |
|
537
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
538
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
539
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
540
|
|
|
|
|
|
|
// |
|
541
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
542
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
namespace unilib { |
|
545
|
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
// Static helpers for validating, decoding and encoding UTF-8. Most operations
// come in three flavours: NUL-terminated C string, (pointer, length) buffer,
// and std::string.
class utf8 {
 public:
  // Validity checks.
  static bool valid(const char* str);
  static bool valid(const char* str, size_t len);
  static inline bool valid(const std::string& str);

  // decode(): returns one code point and advances `str` (and `len`) past it.
  // first(): returns the first code point without advancing the caller's pointer.
  static inline char32_t decode(const char*& str);
  static inline char32_t decode(const char*& str, size_t& len);
  static inline char32_t first(const char* str);
  static inline char32_t first(const char* str, size_t len);
  static inline char32_t first(const std::string& str);

  // Whole-string decoding into `decoded`.
  static void decode(const char* str, std::u32string& decoded);
  static void decode(const char* str, size_t len, std::u32string& decoded);
  static inline void decode(const std::string& str, std::u32string& decoded);

  // Range over the code points of a NUL-terminated string; obtained only
  // through utf8::decoder() (the constructor is private, utf8 is a friend).
  class string_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline string_decoder(const char* str);
    const char* str;
    friend class utf8;
  };
  static inline string_decoder decoder(const char* str);
  static inline string_decoder decoder(const std::string& str);

  // Range over the code points of a (pointer, length) buffer; obtained only
  // through utf8::decoder().
  class buffer_decoder {
   public:
    class iterator;
    inline iterator begin();
    inline iterator end();
   private:
    inline buffer_decoder(const char* str, size_t len);
    const char* str;
    size_t len;
    friend class utf8;
  };
  static inline buffer_decoder decoder(const char* str, size_t len);

  // Encoding: append one code point, or encode a whole UTF-32 string.
  static inline void append(char*& str, char32_t chr);
  static inline void append(std::string& str, char32_t chr);
  static void encode(const std::u32string& str, std::string& encoded);

  // Applies `f` to the input and stores the outcome in `result`. The template
  // parameter list was lost in the listing (`template static void map(F f, …)`)
  // and is restored here.
  // NOTE(review): `f` is presumably called once per decoded code point and
  // returns the code point to encode -- confirm against the definitions.
  template<class F> static void map(F f, const char* str, std::string& result);
  template<class F> static void map(F f, const char* str, size_t len, std::string& result);
  template<class F> static void map(F f, const std::string& str, std::string& result);

 private:
  // NOTE(review): presumably substituted for invalid input -- confirm where it is used.
  static const char REPLACEMENT_CHAR = '?';
};
|
599
|
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
bool utf8::valid(const std::string& str) { |
|
601
|
|
|
|
|
|
|
return valid(str.c_str()); |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
|
|
604
|
54
|
|
|
|
|
|
char32_t utf8::decode(const char*& str) { |
|
605
|
54
|
50
|
|
|
|
|
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
|
606
|
0
|
0
|
|
|
|
|
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
|
607
|
0
|
0
|
|
|
|
|
else if (((unsigned char)*str) < 0xE0) { |
|
608
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x1F) << 6; |
|
609
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
610
|
0
|
|
|
|
|
|
return res + (((unsigned char)*str++) & 0x3F); |
|
611
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF0) { |
|
612
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x0F) << 12; |
|
613
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
614
|
0
|
|
|
|
|
|
res += (((unsigned char)*str++) & 0x3F) << 6; |
|
615
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
616
|
0
|
|
|
|
|
|
return res + (((unsigned char)*str++) & 0x3F); |
|
617
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF8) { |
|
618
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x07) << 18; |
|
619
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
620
|
0
|
|
|
|
|
|
res += (((unsigned char)*str++) & 0x3F) << 12; |
|
621
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
622
|
0
|
|
|
|
|
|
res += (((unsigned char)*str++) & 0x3F) << 6; |
|
623
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
624
|
0
|
|
|
|
|
|
return res + (((unsigned char)*str++) & 0x3F); |
|
625
|
0
|
|
|
|
|
|
} else return ++str, REPLACEMENT_CHAR; |
|
626
|
|
|
|
|
|
|
} |
|
627
|
|
|
|
|
|
|
|
|
628
|
145
|
|
|
|
|
|
char32_t utf8::decode(const char*& str, size_t& len) { |
|
629
|
145
|
50
|
|
|
|
|
if (!len) return 0; |
|
630
|
145
|
|
|
|
|
|
--len; |
|
631
|
145
|
100
|
|
|
|
|
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
|
632
|
23
|
50
|
|
|
|
|
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
|
633
|
23
|
50
|
|
|
|
|
else if (((unsigned char)*str) < 0xE0) { |
|
634
|
23
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x1F) << 6; |
|
635
|
23
|
50
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
636
|
23
|
|
|
|
|
|
return res + ((--len, ((unsigned char)*str++)) & 0x3F); |
|
637
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF0) { |
|
638
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x0F) << 12; |
|
639
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
640
|
0
|
|
|
|
|
|
res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6; |
|
641
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
642
|
0
|
|
|
|
|
|
return res + ((--len, ((unsigned char)*str++)) & 0x3F); |
|
643
|
0
|
0
|
|
|
|
|
} else if (((unsigned char)*str) < 0xF8) { |
|
644
|
0
|
|
|
|
|
|
char32_t res = (((unsigned char)*str++) & 0x07) << 18; |
|
645
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
646
|
0
|
|
|
|
|
|
res += ((--len, ((unsigned char)*str++)) & 0x3F) << 12; |
|
647
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
648
|
0
|
|
|
|
|
|
res += ((--len, ((unsigned char)*str++)) & 0x3F) << 6; |
|
649
|
0
|
0
|
|
|
|
|
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
650
|
0
|
|
|
|
|
|
return res + ((--len, ((unsigned char)*str++)) & 0x3F); |
|
651
|
0
|
|
|
|
|
|
} else return ++str, REPLACEMENT_CHAR; |
|
652
|
|
|
|
|
|
|
} |
|
653
|
|
|
|
|
|
|
|
|
654
|
|
|
|
|
|
|
char32_t utf8::first(const char* str) { |
|
655
|
0
|
|
|
|
|
|
return decode(str); |
|
656
|
|
|
|
|
|
|
} |
|
657
|
|
|
|
|
|
|
|
|
658
|
|
|
|
|
|
|
char32_t utf8::first(const char* str, size_t len) { |
|
659
|
0
|
|
|
|
|
|
return decode(str, len); |
|
660
|
|
|
|
|
|
|
} |
|
661
|
|
|
|
|
|
|
|
|
662
|
|
|
|
|
|
|
char32_t utf8::first(const std::string& str) { |
|
663
|
|
|
|
|
|
|
return first(str.c_str()); |
|
664
|
|
|
|
|
|
|
} |
|
665
|
|
|
|
|
|
|
|
|
666
|
|
|
|
|
|
|
void utf8::decode(const std::string& str, std::u32string& decoded) { |
|
667
|
|
|
|
|
|
|
decode(str.c_str(), decoded); |
|
668
|
|
|
|
|
|
|
} |
|
669
|
|
|
|
|
|
|
|
|
670
|
|
|
|
|
|
|
class utf8::string_decoder::iterator : public std::iterator { |
|
671
|
|
|
|
|
|
|
public: |
|
672
|
36
|
|
|
|
|
|
iterator(const char* str) : codepoint(0), next(str) { operator++(); } |
|
673
|
|
|
|
|
|
|
iterator(const iterator& it) : codepoint(it.codepoint), next(it.next) {} |
|
674
|
54
|
0
|
|
|
|
|
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; } |
|
676
|
|
|
|
|
|
|
bool operator==(const iterator& other) const { return next == other.next; } |
|
677
|
|
|
|
|
|
|
bool operator!=(const iterator& other) const { return next != other.next; } |
|
678
|
|
|
|
|
|
|
const char32_t& operator*() { return codepoint; } |
|
679
|
|
|
|
|
|
|
private: |
|
680
|
|
|
|
|
|
|
char32_t codepoint; |
|
681
|
|
|
|
|
|
|
const char* next; |
|
682
|
|
|
|
|
|
|
}; |
|
683
|
|
|
|
|
|
|
|
|
684
|
|
|
|
|
|
|
utf8::string_decoder::string_decoder(const char* str) : str(str) {} |
|
685
|
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
utf8::string_decoder::iterator utf8::string_decoder::begin() { |
|
687
|
|
|
|
|
|
|
return iterator(str); |
|
688
|
|
|
|
|
|
|
} |
|
689
|
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
utf8::string_decoder::iterator utf8::string_decoder::end() { |
|
691
|
|
|
|
|
|
|
return iterator(nullptr); |
|
692
|
|
|
|
|
|
|
} |
|
693
|
|
|
|
|
|
|
|
|
694
|
|
|
|
|
|
|
utf8::string_decoder utf8::decoder(const char* str) { |
|
695
|
|
|
|
|
|
|
return string_decoder(str); |
|
696
|
|
|
|
|
|
|
} |
|
697
|
|
|
|
|
|
|
|
|
698
|
|
|
|
|
|
|
utf8::string_decoder utf8::decoder(const std::string& str) { |
|
699
|
|
|
|
|
|
|
return string_decoder(str.c_str()); |
|
700
|
|
|
|
|
|
|
} |
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
class utf8::buffer_decoder::iterator : public std::iterator { |
|
703
|
|
|
|
|
|
|
public: |
|
704
|
0
|
|
|
|
|
|
iterator(const char* str, size_t len) : codepoint(0), next(str), len(len) { operator++(); } |
|
705
|
|
|
|
|
|
|
iterator(const iterator& it) : codepoint(it.codepoint), next(it.next), len(it.len) {} |
|
706
|
0
|
0
|
|
|
|
|
iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; } |
|
|
|
0
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
iterator operator++(int) { iterator tmp(*this); operator++(); return tmp; } |
|
708
|
|
|
|
|
|
|
bool operator==(const iterator& other) const { return next == other.next; } |
|
709
|
|
|
|
|
|
|
bool operator!=(const iterator& other) const { return next != other.next; } |
|
710
|
|
|
|
|
|
|
const char32_t& operator*() { return codepoint; } |
|
711
|
|
|
|
|
|
|
private: |
|
712
|
|
|
|
|
|
|
char32_t codepoint; |
|
713
|
|
|
|
|
|
|
const char* next; |
|
714
|
|
|
|
|
|
|
size_t len; |
|
715
|
|
|
|
|
|
|
}; |
|
716
|
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
utf8::buffer_decoder::buffer_decoder(const char* str, size_t len) : str(str), len(len) {} |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
utf8::buffer_decoder::iterator utf8::buffer_decoder::begin() { |
|
720
|
|
|
|
|
|
|
return iterator(str, len); |
|
721
|
|
|
|
|
|
|
} |
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
utf8::buffer_decoder::iterator utf8::buffer_decoder::end() { |
|
724
|
|
|
|
|
|
|
return iterator(nullptr, 0); |
|
725
|
|
|
|
|
|
|
} |
|
726
|
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
utf8::buffer_decoder utf8::decoder(const char* str, size_t len) { |
|
728
|
|
|
|
|
|
|
return buffer_decoder(str, len); |
|
729
|
|
|
|
|
|
|
} |
|
730
|
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
void utf8::append(char*& str, char32_t chr) { |
|
732
|
|
|
|
|
|
|
if (chr < 0x80) *str++ = chr; |
|
733
|
|
|
|
|
|
|
else if (chr < 0x800) { *str++ = 0xC0 + (chr >> 6); *str++ = 0x80 + (chr & 0x3F); } |
|
734
|
|
|
|
|
|
|
else if (chr < 0x10000) { *str++ = 0xE0 + (chr >> 12); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); } |
|
735
|
|
|
|
|
|
|
else if (chr < 0x200000) { *str++ = 0xF0 + (chr >> 18); *str++ = 0x80 + ((chr >> 12) & 0x3F); *str++ = 0x80 + ((chr >> 6) & 0x3F); *str++ = 0x80 + (chr & 0x3F); } |
|
736
|
|
|
|
|
|
|
else *str++ = REPLACEMENT_CHAR; |
|
737
|
|
|
|
|
|
|
} |
|
738
|
|
|
|
|
|
|
|
|
739
|
30
|
|
|
|
|
|
void utf8::append(std::string& str, char32_t chr) { |
|
740
|
30
|
100
|
|
|
|
|
if (chr < 0x80) str += chr; |
|
741
|
5
|
50
|
|
|
|
|
else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); } |
|
742
|
0
|
0
|
|
|
|
|
else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
|
743
|
0
|
0
|
|
|
|
|
else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
|
744
|
|
|
|
|
|
|
else str += REPLACEMENT_CHAR; |
|
745
|
30
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
|
|
747
|
0
|
|
|
|
|
|
template void utf8::map(F f, const char* str, std::string& result) { |
|
748
|
|
|
|
|
|
|
result.clear(); |
|
749
|
|
|
|
|
|
|
|
|
750
|
0
|
0
|
|
|
|
|
for (char32_t chr; (chr = decode(str)); ) |
|
751
|
0
|
|
|
|
|
|
append(result, f(chr)); |
|
752
|
0
|
|
|
|
|
|
} |
|
753
|
|
|
|
|
|
|
|
|
754
|
7
|
|
|
|
|
|
template void utf8::map(F f, const char* str, size_t len, std::string& result) { |
|
755
|
|
|
|
|
|
|
result.clear(); |
|
756
|
|
|
|
|
|
|
|
|
757
|
36
|
100
|
|
|
|
|
while (len) |
|
758
|
29
|
|
|
|
|
|
append(result, f(decode(str, len))); |
|
759
|
7
|
|
|
|
|
|
} |
|
760
|
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
template void utf8::map(F f, const std::string& str, std::string& result) { |
|
762
|
0
|
0
|
|
|
|
|
map(f, str.c_str(), result); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
} |
|
764
|
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
} // namespace unilib |
|
766
|
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
///////// |
|
768
|
|
|
|
|
|
|
// File: model/evaluator.cpp |
|
769
|
|
|
|
|
|
|
///////// |
|
770
|
|
|
|
|
|
|
|
|
771
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
772
|
|
|
|
|
|
|
// |
|
773
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
774
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
775
|
|
|
|
|
|
|
// |
|
776
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
777
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
778
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
779
|
|
|
|
|
|
|
|
|
780
|
2
|
|
|
|
|
|
// Component option sentinels.
// DEFAULT is the empty string (presumably meaning "use the model's default
// options" -- confirm against the model implementation).
// NONE disables a component: evaluate() compares the tokenizer, tagger and
// parser option strings against it.
const string evaluator::DEFAULT = string();
const string evaluator::NONE("none");
|
782
|
|
|
|
|
|
|
|
|
783
|
0
|
|
|
|
|
|
// Create an evaluator for model `m` with the given option strings for the
// tokenizer, tagger and parser. The model pointer is stored, not owned.
evaluator::evaluator(const model* m, const string& tokenizer, const string& tagger, const string& parser) {
  // Same assignments as set_model/set_tokenizer/set_tagger/set_parser.
  this->m = m;
  this->tokenizer = tokenizer;
  this->tagger = tagger;
  this->parser = parser;
}
|
789
|
|
|
|
|
|
|
|
|
790
|
0
|
|
|
|
|
|
// Replace the model used by later evaluate() calls. Only the pointer is
// stored; no ownership is taken here.
void evaluator::set_model(const model* m) {
  this->m = m;
}
|
793
|
|
|
|
|
|
|
|
|
794
|
0
|
|
|
|
|
|
// Store the tokenizer option string; evaluator::NONE makes evaluate() skip
// the plain-text (tokenizing) evaluation.
void evaluator::set_tokenizer(const string& tokenizer) {
  this->tokenizer = tokenizer;
}
|
797
|
|
|
|
|
|
|
|
|
798
|
0
|
|
|
|
|
|
// Store the tagger option string; evaluator::NONE makes evaluate() skip
// tagging.
void evaluator::set_tagger(const string& tagger) {
  this->tagger = tagger;
}
|
801
|
|
|
|
|
|
|
|
|
802
|
0
|
|
|
|
|
|
// Store the parser option string; evaluator::NONE makes evaluate() skip
// parsing.
void evaluator::set_parser(const string& parser) {
  this->parser = parser;
}
|
805
|
|
|
|
|
|
|
|
|
806
|
0
|
|
|
|
|
|
bool evaluator::evaluate(istream& is, ostream& os, string& error) const { |
|
807
|
|
|
|
|
|
|
error.clear(); |
|
808
|
|
|
|
|
|
|
|
|
809
|
0
|
0
|
|
|
|
|
unique_ptr conllu_input(input_format::new_conllu_input_format()); |
|
810
|
0
|
0
|
|
|
|
|
if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false; |
|
|
|
0
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
|
|
812
|
0
|
0
|
|
|
|
|
vector plain_text_paragraphs(1); unsigned space_after_nos = 0; |
|
813
|
0
|
0
|
|
|
|
|
sentence system, gold; |
|
|
|
0
|
|
|
|
|
|
|
814
|
0
|
|
|
|
|
|
evaluation_data gold_data, system_goldtok_data, system_goldtok_goldtags_data, system_plaintext_data; |
|
815
|
|
|
|
|
|
|
|
|
816
|
|
|
|
|
|
|
string block; |
|
817
|
0
|
0
|
|
|
|
|
while (conllu_input->read_block(is, block)) { |
|
|
|
0
|
|
|
|
|
|
|
818
|
0
|
0
|
|
|
|
|
conllu_input->set_text(block); |
|
819
|
0
|
0
|
|
|
|
|
while (conllu_input->next_sentence(gold, error)) { |
|
|
|
0
|
|
|
|
|
|
|
820
|
0
|
0
|
|
|
|
|
gold_data.add_sentence(gold); |
|
821
|
|
|
|
|
|
|
|
|
822
|
|
|
|
|
|
|
// Detokenize the input when tokenizing |
|
823
|
0
|
0
|
|
|
|
|
if (tokenizer != NONE) { |
|
824
|
0
|
0
|
|
|
|
|
if (gold.get_new_doc() || gold.get_new_par()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
825
|
0
|
0
|
|
|
|
|
plain_text_paragraphs.back().append("\n\n"); |
|
826
|
0
|
0
|
|
|
|
|
plain_text_paragraphs.emplace_back(); |
|
827
|
|
|
|
|
|
|
} |
|
828
|
|
|
|
|
|
|
|
|
829
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < gold.words.size(); i++) { |
|
830
|
0
|
0
|
|
|
|
|
const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i]; |
|
|
|
0
|
|
|
|
|
|
|
831
|
|
|
|
|
|
|
plain_text_paragraphs.back().append(tok.form); |
|
832
|
0
|
0
|
|
|
|
|
if (tok.get_space_after()) |
|
|
|
0
|
|
|
|
|
|
|
833
|
0
|
0
|
|
|
|
|
plain_text_paragraphs.back().push_back(' '); |
|
834
|
|
|
|
|
|
|
else |
|
835
|
0
|
|
|
|
|
|
space_after_nos += 1; |
|
836
|
0
|
0
|
|
|
|
|
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
837
|
0
|
|
|
|
|
|
i = gold.multiword_tokens[j++].id_last; |
|
838
|
|
|
|
|
|
|
} |
|
839
|
|
|
|
|
|
|
} |
|
840
|
|
|
|
|
|
|
|
|
841
|
|
|
|
|
|
|
// Goldtok data |
|
842
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger != NONE) { |
|
843
|
0
|
0
|
|
|
|
|
system.clear(); |
|
844
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < gold.words.size(); i++) |
|
845
|
|
|
|
|
|
|
system.add_word(gold.words[i].form); |
|
846
|
|
|
|
|
|
|
|
|
847
|
0
|
0
|
|
|
|
|
if (tagger != NONE) { |
|
848
|
0
|
0
|
|
|
|
|
if (!m->tag(system, tagger, error)) |
|
|
|
0
|
|
|
|
|
|
|
849
|
|
|
|
|
|
|
return false; |
|
850
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
|
851
|
0
|
0
|
|
|
|
|
if (!m->parse(system, parser, error)) |
|
|
|
0
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
return false; |
|
853
|
|
|
|
|
|
|
} |
|
854
|
0
|
0
|
|
|
|
|
system_goldtok_data.add_sentence(system); |
|
855
|
|
|
|
|
|
|
} |
|
856
|
|
|
|
|
|
|
|
|
857
|
|
|
|
|
|
|
// Goldtok_goldtags data |
|
858
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
|
|
0
|
|
|
|
|
|
|
859
|
0
|
0
|
|
|
|
|
system.clear(); |
|
860
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < gold.words.size(); i++) { |
|
861
|
|
|
|
|
|
|
system.add_word(gold.words[i].form); |
|
862
|
0
|
|
|
|
|
|
system.words[i].upostag = gold.words[i].upostag; |
|
863
|
0
|
|
|
|
|
|
system.words[i].xpostag = gold.words[i].xpostag; |
|
864
|
0
|
|
|
|
|
|
system.words[i].feats = gold.words[i].feats; |
|
865
|
0
|
|
|
|
|
|
system.words[i].lemma = gold.words[i].lemma; |
|
866
|
|
|
|
|
|
|
} |
|
867
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
|
868
|
0
|
0
|
|
|
|
|
if (!m->parse(system, parser, error)) |
|
|
|
0
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
return false; |
|
870
|
0
|
0
|
|
|
|
|
system_goldtok_goldtags_data.add_sentence(system); |
|
871
|
|
|
|
|
|
|
} |
|
872
|
|
|
|
|
|
|
} |
|
873
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
|
874
|
|
|
|
|
|
|
} |
|
875
|
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
// Tokenize, tag and parse plaintext input |
|
877
|
0
|
0
|
|
|
|
|
if (tokenizer != NONE) { |
|
878
|
0
|
0
|
|
|
|
|
unique_ptr t(m->new_tokenizer(tokenizer)); |
|
879
|
0
|
0
|
|
|
|
|
if (!t) return error.assign("Cannot allocate new tokenizer!"), false; |
|
|
|
0
|
|
|
|
|
|
|
880
|
|
|
|
|
|
|
|
|
881
|
0
|
0
|
|
|
|
|
for (auto&& plain_text : plain_text_paragraphs) { |
|
882
|
0
|
0
|
|
|
|
|
t->set_text(plain_text); |
|
883
|
0
|
0
|
|
|
|
|
while (t->next_sentence(system, error)) { |
|
|
|
0
|
|
|
|
|
|
|
884
|
0
|
0
|
|
|
|
|
if (tagger != NONE) { |
|
885
|
0
|
0
|
|
|
|
|
if (!m->tag(system, tagger, error)) |
|
|
|
0
|
|
|
|
|
|
|
886
|
|
|
|
|
|
|
return false; |
|
887
|
|
|
|
|
|
|
|
|
888
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
|
889
|
0
|
0
|
|
|
|
|
if (!m->parse(system, parser, error)) |
|
|
|
0
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
return false; |
|
891
|
|
|
|
|
|
|
} |
|
892
|
0
|
0
|
|
|
|
|
system_plaintext_data.add_sentence(system); |
|
893
|
|
|
|
|
|
|
} |
|
894
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
|
895
|
|
|
|
|
|
|
} |
|
896
|
|
|
|
|
|
|
} |
|
897
|
|
|
|
|
|
|
|
|
898
|
|
|
|
|
|
|
// Evaluate from plain text |
|
899
|
0
|
0
|
|
|
|
|
if (tokenizer != NONE) { |
|
900
|
0
|
0
|
|
|
|
|
if (system_plaintext_data.chars != gold_data.chars) { |
|
901
|
|
|
|
|
|
|
os << "Cannot evaluate tokenizer, it returned different sequence of token characters!" << endl; |
|
902
|
|
|
|
|
|
|
} else { |
|
903
|
|
|
|
|
|
|
word_alignment plaintext_alignment; |
|
904
|
0
|
0
|
|
|
|
|
word_alignment::best_alignment(system_plaintext_data, gold_data, plaintext_alignment); |
|
905
|
|
|
|
|
|
|
|
|
906
|
|
|
|
|
|
|
os << "Number of SpaceAfter=No features in gold data: " << space_after_nos << endl; |
|
907
|
|
|
|
|
|
|
|
|
908
|
0
|
|
|
|
|
|
auto tokens = evaluate_f1(system_plaintext_data.tokens, gold_data.tokens); |
|
909
|
0
|
|
|
|
|
|
auto multiwords = evaluate_f1(system_plaintext_data.multiwords, gold_data.multiwords); |
|
910
|
0
|
|
|
|
|
|
auto sentences = evaluate_f1(system_plaintext_data.sentences, gold_data.sentences); |
|
911
|
0
|
|
|
|
|
|
auto words = plaintext_alignment.evaluate_f1([](const word&, const word&) {return true;}); |
|
912
|
0
|
0
|
|
|
|
|
if (multiwords.total_gold || multiwords.total_system) |
|
|
|
0
|
|
|
|
|
|
|
913
|
0
|
|
|
|
|
|
os << "Tokenizer tokens - system: " << tokens.total_system << ", gold: " << tokens.total_gold |
|
914
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * tokens.precision |
|
915
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * tokens.recall << "%, f1: " << 100. * tokens.f1 << "%" << endl |
|
916
|
|
|
|
|
|
|
<< "Tokenizer multiword tokens - system: " << multiwords.total_system << ", gold: " << multiwords.total_gold |
|
917
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * multiwords.precision |
|
918
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * multiwords.recall << "%, f1: " << 100. * multiwords.f1 << "%" << endl; |
|
919
|
0
|
|
|
|
|
|
os << "Tokenizer words - system: " << words.total_system << ", gold: " << words.total_gold |
|
920
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * words.precision |
|
921
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * words.recall << "%, f1: " << 100. * words.f1 << "%" << endl |
|
922
|
0
|
|
|
|
|
|
<< "Tokenizer sentences - system: " << sentences.total_system << ", gold: " << sentences.total_gold |
|
923
|
0
|
|
|
|
|
|
<< ", precision: " << fixed << setprecision(2) << 100. * sentences.precision |
|
924
|
0
|
|
|
|
|
|
<< "%, recall: " << 100. * sentences.recall << "%, f1: " << 100. * sentences.f1 << "%" << endl; |
|
925
|
|
|
|
|
|
|
|
|
926
|
0
|
0
|
|
|
|
|
if (tagger != NONE) { |
|
927
|
0
|
|
|
|
|
|
auto upostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; }); |
|
928
|
0
|
|
|
|
|
|
auto xpostags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; }); |
|
929
|
0
|
|
|
|
|
|
auto feats = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; }); |
|
930
|
0
|
0
|
|
|
|
|
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
931
|
0
|
|
|
|
|
|
auto lemmas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; }); |
|
932
|
0
|
|
|
|
|
|
os << "Tagging from plain text (CoNLL17 F1 score) - gold forms: " << upostags.total_gold << ", upostag: " |
|
933
|
0
|
|
|
|
|
|
<< fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: " |
|
934
|
0
|
|
|
|
|
|
<< 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: " |
|
935
|
0
|
|
|
|
|
|
<< 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
|
|
|
|
|
|
|
|
938
|
0
|
0
|
|
|
|
|
if (tagger != NONE && parser != NONE) { |
|
939
|
0
|
|
|
|
|
|
auto uas = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; }); |
|
940
|
0
|
0
|
|
|
|
|
auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
|
0
|
|
|
|
|
|
|
941
|
0
|
|
|
|
|
|
os << "Parsing from plain text with computed tags (CoNLL17 F1 score) - gold forms: " << uas.total_gold |
|
942
|
0
|
|
|
|
|
|
<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl; |
|
943
|
|
|
|
|
|
|
} |
|
944
|
|
|
|
|
|
|
} |
|
945
|
|
|
|
|
|
|
} |
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
// Evaluate tagger from gold tokenization |
|
948
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger != NONE) { |
|
949
|
|
|
|
|
|
|
word_alignment goldtok_alignment; |
|
950
|
0
|
0
|
|
|
|
|
if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment)) |
|
|
|
0
|
|
|
|
|
|
|
951
|
0
|
0
|
|
|
|
|
return error.assign("Internal UDPipe error (the words of the gold data do not match)!"), false; |
|
952
|
|
|
|
|
|
|
|
|
953
|
0
|
|
|
|
|
|
auto upostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag; }); |
|
954
|
0
|
|
|
|
|
|
auto xpostags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.xpostag == u.xpostag; }); |
|
955
|
0
|
|
|
|
|
|
auto feats = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.feats == u.feats; }); |
|
956
|
0
|
0
|
|
|
|
|
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
957
|
0
|
|
|
|
|
|
auto lemmas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.lemma == u.lemma; }); |
|
958
|
0
|
|
|
|
|
|
os << "Tagging from gold tokenization - forms: " << upostags.total_gold << ", upostag: " |
|
959
|
0
|
|
|
|
|
|
<< fixed << setprecision(2) << 100. * upostags.f1 << "%, xpostag: " |
|
960
|
0
|
|
|
|
|
|
<< 100. * xpostags.f1 << "%, feats: " << 100. * feats.f1 << "%, alltags: " |
|
961
|
0
|
|
|
|
|
|
<< 100. * alltags.f1 << "%, lemmas: " << 100. * lemmas.f1 << '%' << endl; |
|
962
|
|
|
|
|
|
|
|
|
963
|
0
|
0
|
|
|
|
|
if (parser != NONE) { |
|
964
|
0
|
|
|
|
|
|
auto uas = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; }); |
|
965
|
0
|
0
|
|
|
|
|
auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
|
0
|
|
|
|
|
|
|
966
|
0
|
|
|
|
|
|
os << "Parsing from gold tokenization with computed tags - forms: " << uas.total_gold |
|
967
|
0
|
|
|
|
|
|
<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl; |
|
968
|
|
|
|
|
|
|
} |
|
969
|
|
|
|
|
|
|
} |
|
970
|
|
|
|
|
|
|
|
|
971
|
|
|
|
|
|
|
// Evaluate parser from gold tokenization |
|
972
|
0
|
0
|
|
|
|
|
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
|
|
0
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
word_alignment goldtok_goldtags_alignment; |
|
974
|
0
|
0
|
|
|
|
|
if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment)) |
|
|
|
0
|
|
|
|
|
|
|
975
|
0
|
0
|
|
|
|
|
return error.assign("Internal UDPipe error (the words of the goldtok data do not match)!"), false; |
|
976
|
|
|
|
|
|
|
|
|
977
|
0
|
|
|
|
|
|
auto uas = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head; }); |
|
978
|
0
|
0
|
|
|
|
|
auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
|
0
|
|
|
|
|
|
|
979
|
0
|
|
|
|
|
|
os << "Parsing from gold tokenization with gold tags - forms: " << uas.total_gold |
|
980
|
0
|
|
|
|
|
|
<< ", UAS: " << fixed << setprecision(2) << 100. * uas.f1 << "%, LAS: " << 100. * las.f1 << '%' << endl; |
|
981
|
|
|
|
|
|
|
} |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
return true; |
|
984
|
|
|
|
|
|
|
} |
|
985
|
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
template |
|
987
|
0
|
|
|
|
|
|
evaluator::f1_info evaluator::evaluate_f1(const vector>& system, const vector>& gold) { |
|
988
|
|
|
|
|
|
|
size_t both = 0; |
|
989
|
0
|
0
|
|
|
|
|
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
990
|
0
|
0
|
|
|
|
|
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
991
|
0
|
|
|
|
|
|
si++; |
|
992
|
0
|
0
|
|
|
|
|
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
993
|
0
|
|
|
|
|
|
gi++; |
|
994
|
|
|
|
|
|
|
else |
|
995
|
0
|
|
|
|
|
|
both += system[si++].second == gold[gi++].second; |
|
996
|
|
|
|
|
|
|
|
|
997
|
|
|
|
|
|
|
return {system.size(), gold.size(), system.size() ? both / double(system.size()) : 0., |
|
998
|
0
|
0
|
|
|
|
|
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
999
|
|
|
|
|
|
|
} |
|
1000
|
|
|
|
|
|
|
|
|
1001
|
0
|
|
|
|
|
|
// Build the evaluation record of one word covering characters [start, end).
// A copy of `w` is stored and normalized for comparison: its id becomes the
// given absolute `id`, its head is shifted by the same amount, its form is
// lowercased and its deprel is cut at the first colon.
evaluator::evaluation_data::word_data::word_data(size_t start, size_t end, int id, bool is_multiword, const word& w)
  : start(start), end(end), is_multiword(is_multiword), w(w)
{
  // Inside the body `w` is the constructor argument; `stored` is the member copy.
  auto& stored = this->w;

  // Use absolute ids for words and heads. A head of 0 is kept as 0.
  stored.id = id;
  if (w.head)
    stored.head = id + (w.head - w.id);
  else
    stored.head = 0;

  // Forms in MWTs are compared case-insensitively in LCS, therefore
  // we lowercase them here.
  unilib::utf8::map(unilib::unicode::lowercase, w.form, stored.form);

  // During evaluation, only universal part of DEPREL (up to a colon) is used.
  const auto colon = stored.deprel.find(':');
  if (colon != string::npos)
    stored.deprel.resize(colon);
}
|
1017
|
|
|
|
|
|
|
|
|
1018
|
0
|
|
|
|
|
|
void evaluator::evaluation_data::add_sentence(const sentence& s) { |
|
1019
|
0
|
|
|
|
|
|
sentences.emplace_back(chars.size(), chars.size()); |
|
1020
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
1021
|
0
|
|
|
|
|
|
tokens.emplace_back(chars.size(), chars.size()); |
|
1022
|
0
|
0
|
|
|
|
|
const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form; |
|
|
|
0
|
|
|
|
|
|
|
1023
|
0
|
0
|
|
|
|
|
for (auto&& chr : unilib::utf8::decoder(form)) |
|
1024
|
0
|
0
|
|
|
|
|
if (chr != ' ') |
|
1025
|
0
|
|
|
|
|
|
chars.push_back(chr); |
|
1026
|
0
|
|
|
|
|
|
tokens.back().second = chars.size(); |
|
1027
|
|
|
|
|
|
|
|
|
1028
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1029
|
0
|
|
|
|
|
|
multiwords.emplace_back(tokens.back().first, form); |
|
1030
|
0
|
0
|
|
|
|
|
for (size_t k = i; int(k) <= s.multiword_tokens[j].id_last; k++) { |
|
1031
|
0
|
|
|
|
|
|
words.emplace_back(tokens.back().first, tokens.back().second, (int)words.size() + 1, true, s.words[k]); |
|
1032
|
0
|
|
|
|
|
|
multiwords.back().second.append(" ").append(words.back().w.form); |
|
1033
|
|
|
|
|
|
|
} |
|
1034
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
|
1035
|
|
|
|
|
|
|
} else { |
|
1036
|
0
|
|
|
|
|
|
words.emplace_back(tokens.back().first, tokens.back().second, (int)words.size() + 1, false, s.words[i]); |
|
1037
|
|
|
|
|
|
|
} |
|
1038
|
|
|
|
|
|
|
} |
|
1039
|
0
|
|
|
|
|
|
sentences.back().second = chars.size(); |
|
1040
|
0
|
|
|
|
|
|
} |
|
1041
|
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
template |
|
1043
|
0
|
|
|
|
|
|
evaluator::f1_info evaluator::word_alignment::evaluate_f1(Equals equals) { |
|
1044
|
|
|
|
|
|
|
size_t both = 0; |
|
1045
|
0
|
0
|
|
|
|
|
for (auto&& match : matched) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1046
|
0
|
0
|
|
|
|
|
if (equals(match.system, match.gold)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1047
|
0
|
|
|
|
|
|
both++; |
|
1048
|
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
return {total_system, total_gold, total_system ? both / double(total_system) : 0., |
|
1050
|
0
|
0
|
|
|
|
|
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
} |
|
1052
|
|
|
|
|
|
|
|
|
1053
|
0
|
|
|
|
|
|
// Try to align system and gold words one to one, in order.
//
// Returns true only when both sides have the same number of words and every
// pair at the same index has identical forms; alignment.matched then holds
// all pairs. The totals are always updated. On a count mismatch
// alignment.matched is left untouched; on a form mismatch it holds the pairs
// preceding the first mismatch.
bool evaluator::word_alignment::perfect_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) {
  alignment.total_system = system.words.size();
  alignment.total_gold = gold.words.size();
  if (alignment.total_system != alignment.total_gold)
    return false;

  alignment.matched.clear();
  alignment.matched.reserve(alignment.total_system);

  const size_t count = system.words.size();
  for (size_t index = 0; index < count; index++) {
    const auto& system_word = system.words[index].w;
    const auto& gold_word = gold.words[index].w;

    // Any difference in forms rules out the perfect alignment.
    if (system_word.form != gold_word.form)
      return false;

    alignment.matched.emplace_back(system_word, gold_word);
  }

  return true;
}
|
1068
|
|
|
|
|
|
|
|
|
1069
|
0
|
|
|
|
|
|
void evaluator::word_alignment::best_alignment(const evaluation_data& system, const evaluation_data& gold, word_alignment& alignment) { |
|
1070
|
0
|
|
|
|
|
|
alignment.total_system = system.words.size(); |
|
1071
|
0
|
|
|
|
|
|
alignment.total_gold = gold.words.size(); |
|
1072
|
|
|
|
|
|
|
alignment.matched.clear(); |
|
1073
|
|
|
|
|
|
|
|
|
1074
|
0
|
0
|
|
|
|
|
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1075
|
0
|
0
|
|
|
|
|
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1076
|
0
|
0
|
|
|
|
|
(gold.words[gi].start > system.words[si].start || !gold.words[gi].is_multiword)) { |
|
1077
|
|
|
|
|
|
|
// No multiword, align using start+end indices |
|
1078
|
0
|
0
|
|
|
|
|
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1079
|
0
|
|
|
|
|
|
alignment.matched.emplace_back(system.words[si++].w, gold.words[gi++].w); |
|
1080
|
0
|
0
|
|
|
|
|
else if (system.words[si].start <= gold.words[gi].start) |
|
1081
|
0
|
|
|
|
|
|
si++; |
|
1082
|
|
|
|
|
|
|
else |
|
1083
|
0
|
|
|
|
|
|
gi++; |
|
1084
|
|
|
|
|
|
|
} else { |
|
1085
|
|
|
|
|
|
|
// We have a multiword |
|
1086
|
0
|
0
|
|
|
|
|
size_t ss = si, gs = gi, multiword_range_end = system.words[si].is_multiword ? system.words[si].end : gold.words[gi].end; |
|
1087
|
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
// Find all words in the multiword range |
|
1089
|
0
|
0
|
|
|
|
|
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1090
|
0
|
0
|
|
|
|
|
system.words[si].end <= multiword_range_end)) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1091
|
0
|
0
|
|
|
|
|
(gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end : |
|
|
|
0
|
|
|
|
|
|
|
1092
|
0
|
|
|
|
|
|
gold.words[gi].end <= multiword_range_end))) { |
|
1093
|
|
|
|
|
|
|
// Extend the multiword range |
|
1094
|
0
|
0
|
|
|
|
|
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1095
|
0
|
0
|
|
|
|
|
if (system.words[si].is_multiword) multiword_range_end = max(multiword_range_end, system.words[si].end); |
|
1096
|
0
|
|
|
|
|
|
si++; |
|
1097
|
|
|
|
|
|
|
} else { |
|
1098
|
0
|
0
|
|
|
|
|
if (gold.words[gi].is_multiword) multiword_range_end = max(multiword_range_end, gold.words[gi].end); |
|
1099
|
0
|
|
|
|
|
|
gi++; |
|
1100
|
|
|
|
|
|
|
} |
|
1101
|
|
|
|
|
|
|
} |
|
1102
|
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
// LCS on the chosen words |
|
1104
|
0
|
|
|
|
|
|
vector> lcs(si - ss); |
|
1105
|
0
|
0
|
|
|
|
|
for (unsigned s = si - ss; s--; ) { |
|
1106
|
0
|
0
|
|
|
|
|
lcs[s].resize(gi - gs); |
|
1107
|
0
|
0
|
|
|
|
|
for (unsigned g = gi - gs; g--; ) { |
|
1108
|
0
|
0
|
|
|
|
|
lcs[s][g] = max(lcs[s][g], s+1 < lcs.size() ? lcs[s+1][g] : 0); |
|
1109
|
0
|
0
|
|
|
|
|
lcs[s][g] = max(lcs[s][g], g+1 < lcs[s].size() ? lcs[s][g+1] : 0); |
|
1110
|
0
|
0
|
|
|
|
|
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
|
1111
|
0
|
0
|
|
|
|
|
lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0)); |
|
|
|
0
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
} |
|
1113
|
|
|
|
|
|
|
} |
|
1114
|
|
|
|
|
|
|
|
|
1115
|
0
|
0
|
|
|
|
|
for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) { |
|
|
|
0
|
|
|
|
|
|
|
1116
|
0
|
0
|
|
|
|
|
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
|
1117
|
0
|
0
|
|
|
|
|
alignment.matched.emplace_back(system.words[ss + s++].w, gold.words[gs + g++].w); |
|
1118
|
0
|
0
|
|
|
|
|
else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0)) |
|
|
|
0
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
s++; |
|
1120
|
|
|
|
|
|
|
else /* if (lcs[s][g] == (g+1 < lcs[s].size() ? lcs[s][g+1] : 0)) */ |
|
1121
|
0
|
|
|
|
|
|
g++; |
|
1122
|
|
|
|
|
|
|
} |
|
1123
|
|
|
|
|
|
|
} |
|
1124
|
|
|
|
|
|
|
|
|
1125
|
|
|
|
|
|
|
// Reindex HEAD pointers in system to use gold indices |
|
1126
|
0
|
|
|
|
|
|
vector gold_aligned(system.words.size(), -1); |
|
1127
|
0
|
0
|
|
|
|
|
for (auto&& match : alignment.matched) |
|
1128
|
0
|
|
|
|
|
|
gold_aligned[match.system.id - 1] = match.gold.id; |
|
1129
|
0
|
0
|
|
|
|
|
for (auto&& match : alignment.matched) |
|
1130
|
0
|
0
|
|
|
|
|
if (match.system.head > 0) |
|
1131
|
0
|
|
|
|
|
|
match.system.head = gold_aligned[match.system.head - 1]; |
|
1132
|
0
|
|
|
|
|
|
} |
|
1133
|
|
|
|
|
|
|
|
|
1134
|
|
|
|
|
|
|
///////// |
|
1135
|
|
|
|
|
|
|
// File: morphodita/derivator/derivator.h |
|
1136
|
|
|
|
|
|
|
///////// |
|
1137
|
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
1139
|
|
|
|
|
|
|
// |
|
1140
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
1141
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1142
|
|
|
|
|
|
|
// |
|
1143
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1144
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1145
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1146
|
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
namespace morphodita { |
|
1148
|
|
|
|
|
|
|
|
|
1149
|
0
|
|
|
|
|
|
struct derivated_lemma { |
|
1150
|
|
|
|
|
|
|
string lemma; |
|
1151
|
|
|
|
|
|
|
}; |
|
1152
|
|
|
|
|
|
|
|
|
1153
|
0
|
|
|
|
|
|
class derivator { |
|
1154
|
|
|
|
|
|
|
public: |
|
1155
|
0
|
|
|
|
|
|
virtual ~derivator() {} |
|
1156
|
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
// For given lemma, return the parent in the derivation graph. |
|
1158
|
|
|
|
|
|
|
// The lemma is assumed to be lemma id and any lemma comments are ignored. |
|
1159
|
|
|
|
|
|
|
virtual bool parent(string_piece lemma, derivated_lemma& parent) const = 0; |
|
1160
|
|
|
|
|
|
|
|
|
1161
|
|
|
|
|
|
|
// For given lemma, return the children in the derivation graph. |
|
1162
|
|
|
|
|
|
|
// The lemma is assumed to be lemma id and any lemma comments are ignored. |
|
1163
|
|
|
|
|
|
|
virtual bool children(string_piece lemma, vector& children) const = 0; |
|
1164
|
|
|
|
|
|
|
}; |
|
1165
|
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
} // namespace morphodita |
|
1167
|
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
///////// |
|
1169
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer.h |
|
1170
|
|
|
|
|
|
|
///////// |
|
1171
|
|
|
|
|
|
|
|
|
1172
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
1173
|
|
|
|
|
|
|
// |
|
1174
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1175
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1176
|
|
|
|
|
|
|
// |
|
1177
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1178
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1179
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1180
|
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
namespace morphodita { |
|
1182
|
|
|
|
|
|
|
|
|
1183
|
|
|
|
|
|
|
// Range of a token, measured in Unicode characters, not UTF8 bytes. |
|
1184
|
|
|
|
|
|
|
struct token_range { |
|
1185
|
|
|
|
|
|
|
size_t start; |
|
1186
|
|
|
|
|
|
|
size_t length; |
|
1187
|
|
|
|
|
|
|
|
|
1188
|
|
|
|
|
|
|
token_range() {} |
|
1189
|
7
|
|
|
|
|
|
token_range(size_t start, size_t length) : start(start), length(length) {} |
|
1190
|
|
|
|
|
|
|
}; |
|
1191
|
|
|
|
|
|
|
|
|
1192
|
1
|
|
|
|
|
|
class tokenizer { |
|
1193
|
|
|
|
|
|
|
public: |
|
1194
|
1
|
|
|
|
|
|
virtual ~tokenizer() {} |
|
1195
|
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) = 0; |
|
1197
|
|
|
|
|
|
|
virtual bool next_sentence(vector* forms, vector* tokens) = 0; |
|
1198
|
|
|
|
|
|
|
|
|
1199
|
|
|
|
|
|
|
// Static factory methods |
|
1200
|
|
|
|
|
|
|
static tokenizer* new_vertical_tokenizer(); |
|
1201
|
|
|
|
|
|
|
|
|
1202
|
|
|
|
|
|
|
static tokenizer* new_czech_tokenizer(); |
|
1203
|
|
|
|
|
|
|
static tokenizer* new_english_tokenizer(); |
|
1204
|
|
|
|
|
|
|
static tokenizer* new_generic_tokenizer(); |
|
1205
|
|
|
|
|
|
|
}; |
|
1206
|
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
} // namespace morphodita |
|
1208
|
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
///////// |
|
1210
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho.h |
|
1211
|
|
|
|
|
|
|
///////// |
|
1212
|
|
|
|
|
|
|
|
|
1213
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
1214
|
|
|
|
|
|
|
// |
|
1215
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1216
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1217
|
|
|
|
|
|
|
// |
|
1218
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1219
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1220
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1221
|
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
namespace morphodita { |
|
1223
|
|
|
|
|
|
|
|
|
1224
|
0
|
|
|
|
|
|
struct tagged_form { |
|
1225
|
|
|
|
|
|
|
string form; |
|
1226
|
|
|
|
|
|
|
string tag; |
|
1227
|
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
tagged_form() {} |
|
1229
|
0
|
|
|
|
|
|
tagged_form(const string& form, const string& tag) : form(form), tag(tag) {} |
|
1230
|
|
|
|
|
|
|
}; |
|
1231
|
|
|
|
|
|
|
|
|
1232
|
46
|
|
|
|
|
|
struct tagged_lemma { |
|
1233
|
|
|
|
|
|
|
string lemma; |
|
1234
|
|
|
|
|
|
|
string tag; |
|
1235
|
|
|
|
|
|
|
|
|
1236
|
|
|
|
|
|
|
tagged_lemma() {} |
|
1237
|
10
|
|
|
|
|
|
tagged_lemma(const string& lemma, const string& tag) : lemma(lemma), tag(tag) {} |
|
1238
|
|
|
|
|
|
|
}; |
|
1239
|
|
|
|
|
|
|
|
|
1240
|
0
|
|
|
|
|
|
struct tagged_lemma_forms { |
|
1241
|
|
|
|
|
|
|
string lemma; |
|
1242
|
|
|
|
|
|
|
vector forms; |
|
1243
|
|
|
|
|
|
|
|
|
1244
|
|
|
|
|
|
|
tagged_lemma_forms() {} |
|
1245
|
0
|
|
|
|
|
|
tagged_lemma_forms(const string& lemma) : lemma(lemma) {} |
|
1246
|
|
|
|
|
|
|
}; |
|
1247
|
|
|
|
|
|
|
|
|
1248
|
1
|
|
|
|
|
|
class morpho { |
|
1249
|
|
|
|
|
|
|
public: |
|
1250
|
2
|
|
|
|
|
|
virtual ~morpho() {} |
|
1251
|
|
|
|
|
|
|
|
|
1252
|
|
|
|
|
|
|
static morpho* load(istream& is); |
|
1253
|
|
|
|
|
|
|
static morpho* load(const char* fname); |
|
1254
|
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
enum guesser_mode { NO_GUESSER = 0, GUESSER = 1, GUESSER_UNSPECIFIED = -1 }; |
|
1256
|
|
|
|
|
|
|
|
|
1257
|
|
|
|
|
|
|
// Perform morphologic analysis of a form. The form is given by a pointer and |
|
1258
|
|
|
|
|
|
|
// length and therefore does not need to be '\0' terminated. The guesser |
|
1259
|
|
|
|
|
|
|
// parameter specifies whether a guesser can be used if the form is not found |
|
1260
|
|
|
|
|
|
|
// in the dictionary. Output is assigned to the lemmas vector. |
|
1261
|
|
|
|
|
|
|
// |
|
1262
|
|
|
|
|
|
|
// If the form is found in the dictionary, analyses are assigned to lemmas |
|
1263
|
|
|
|
|
|
|
// and NO_GUESSER returned. If guesser == GUESSER and the form analyses are |
|
1264
|
|
|
|
|
|
|
// found using a guesser, they are assigned to lemmas and GUESSER is |
|
1265
|
|
|
|
|
|
|
// returned. Otherwise <0 is returned and lemmas are filled with one |
|
1266
|
|
|
|
|
|
|
// analysis containing given form as lemma and a tag for unknown word. |
|
1267
|
|
|
|
|
|
|
virtual int analyze(string_piece form, guesser_mode guesser, vector& lemmas) const = 0; |
|
1268
|
|
|
|
|
|
|
|
|
1269
|
|
|
|
|
|
|
// Perform morphologic generation of a lemma. The lemma is given by a pointer |
|
1270
|
|
|
|
|
|
|
// and length and therefore does not need to be '\0' terminated. Optionally |
|
1271
|
|
|
|
|
|
|
// a tag_wildcard can be specified (or be NULL) and if so, results are |
|
1272
|
|
|
|
|
|
|
// filtered using this wildcard. The guesser parameter speficies whether |
|
1273
|
|
|
|
|
|
|
// a guesser can be used if the lemma is not found in the dictionary. Output |
|
1274
|
|
|
|
|
|
|
// is assigned to the forms vector. |
|
1275
|
|
|
|
|
|
|
// |
|
1276
|
|
|
|
|
|
|
// Tag_wildcard can be either NULL or a wildcard applied to the results. |
|
1277
|
|
|
|
|
|
|
// A ? in the wildcard matches any character, [bytes] matches any of the |
|
1278
|
|
|
|
|
|
|
// bytes and [^bytes] matches any byte different from the specified ones. |
|
1279
|
|
|
|
|
|
|
// A - has no special meaning inside the bytes and if ] is first in bytes, it |
|
1280
|
|
|
|
|
|
|
// does not end the bytes group. |
|
1281
|
|
|
|
|
|
|
// |
|
1282
|
|
|
|
|
|
|
// If the given lemma is only a raw lemma, all lemma ids with this raw lemma |
|
1283
|
|
|
|
|
|
|
// are returned. Otherwise only matching lemma ids are returned, ignoring any |
|
1284
|
|
|
|
|
|
|
// lemma comments. For every found lemma, matching forms are filtered using |
|
1285
|
|
|
|
|
|
|
// the tag_wildcard. If at least one lemma is found in the dictionary, |
|
1286
|
|
|
|
|
|
|
// NO_GUESSER is returned. If guesser == GUESSER and the lemma is found by |
|
1287
|
|
|
|
|
|
|
// the guesser, GUESSER is returned. Otherwise, forms are cleared and <0 is |
|
1288
|
|
|
|
|
|
|
// returned. |
|
1289
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const = 0; |
|
1290
|
|
|
|
|
|
|
|
|
1291
|
|
|
|
|
|
|
// Rawlemma and lemma id identification |
|
1292
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const = 0; |
|
1293
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const = 0; |
|
1294
|
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
// Rawform identification |
|
1296
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const = 0; |
|
1297
|
|
|
|
|
|
|
|
|
1298
|
|
|
|
|
|
|
// Construct a new tokenizer instance appropriate for this morphology. |
|
1299
|
|
|
|
|
|
|
// Can return NULL if no such tokenizer exists. |
|
1300
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const = 0; |
|
1301
|
|
|
|
|
|
|
|
|
1302
|
|
|
|
|
|
|
// Return a derivator for this morphology, or NULL if it does not exist. |
|
1303
|
|
|
|
|
|
|
// The returned instance is owned by the morphology and should not be deleted. |
|
1304
|
|
|
|
|
|
|
virtual const derivator* get_derivator() const; |
|
1305
|
|
|
|
|
|
|
|
|
1306
|
|
|
|
|
|
|
protected: |
|
1307
|
|
|
|
|
|
|
unique_ptr derinet; |
|
1308
|
|
|
|
|
|
|
}; |
|
1309
|
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
} // namespace morphodita |
|
1311
|
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
///////// |
|
1313
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer_factory.h |
|
1314
|
|
|
|
|
|
|
///////// |
|
1315
|
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
1317
|
|
|
|
|
|
|
// |
|
1318
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
1319
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1320
|
|
|
|
|
|
|
// |
|
1321
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1322
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1323
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1324
|
|
|
|
|
|
|
|
|
1325
|
|
|
|
|
|
|
namespace morphodita { |
|
1326
|
|
|
|
|
|
|
|
|
1327
|
1
|
|
|
|
|
|
class tokenizer_factory { |
|
1328
|
|
|
|
|
|
|
public: |
|
1329
|
1
|
|
|
|
|
|
virtual ~tokenizer_factory() {} |
|
1330
|
|
|
|
|
|
|
|
|
1331
|
|
|
|
|
|
|
static tokenizer_factory* load(istream& is); |
|
1332
|
|
|
|
|
|
|
static tokenizer_factory* load(const char* fname); |
|
1333
|
|
|
|
|
|
|
|
|
1334
|
|
|
|
|
|
|
// Construct a new tokenizer instance. |
|
1335
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer(const morpho* m) const = 0; |
|
1336
|
|
|
|
|
|
|
}; |
|
1337
|
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
} // namespace morphodita |
|
1339
|
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
///////// |
|
1341
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger.h |
|
1342
|
|
|
|
|
|
|
///////// |
|
1343
|
|
|
|
|
|
|
|
|
1344
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
1345
|
|
|
|
|
|
|
// |
|
1346
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1347
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1348
|
|
|
|
|
|
|
// |
|
1349
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1350
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1351
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1352
|
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
namespace morphodita { |
|
1354
|
|
|
|
|
|
|
|
|
1355
|
1
|
|
|
|
|
|
class tagger { |
|
1356
|
|
|
|
|
|
|
public: |
|
1357
|
1
|
|
|
|
|
|
virtual ~tagger() {} |
|
1358
|
|
|
|
|
|
|
|
|
1359
|
|
|
|
|
|
|
static tagger* load(const char* fname); |
|
1360
|
|
|
|
|
|
|
static tagger* load(istream& is); |
|
1361
|
|
|
|
|
|
|
|
|
1362
|
|
|
|
|
|
|
// Return morpho associated with the tagger. Do not delete the pointer, it is |
|
1363
|
|
|
|
|
|
|
// owned by the tagger instance and deleted in the tagger destructor. |
|
1364
|
|
|
|
|
|
|
virtual const morpho* get_morpho() const = 0; |
|
1365
|
|
|
|
|
|
|
|
|
1366
|
|
|
|
|
|
|
// Perform morphologic analysis and subsequent disambiguation. |
|
1367
|
|
|
|
|
|
|
virtual void tag(const vector& forms, vector& tags, morpho::guesser_mode guesser = morpho::GUESSER_UNSPECIFIED) const = 0; |
|
1368
|
|
|
|
|
|
|
|
|
1369
|
|
|
|
|
|
|
// Perform disambiguation only on given analyses. |
|
1370
|
|
|
|
|
|
|
virtual void tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const = 0; |
|
1371
|
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
// Construct a new tokenizer instance appropriate for this tagger. |
|
1373
|
|
|
|
|
|
|
// Can return NULL if no such tokenizer exists. |
|
1374
|
|
|
|
|
|
|
// Is equal to get_morpho()->new_tokenizer. |
|
1375
|
|
|
|
|
|
|
tokenizer* new_tokenizer() const; |
|
1376
|
|
|
|
|
|
|
}; |
|
1377
|
|
|
|
|
|
|
|
|
1378
|
|
|
|
|
|
|
} // namespace morphodita |
|
1379
|
|
|
|
|
|
|
|
|
1380
|
|
|
|
|
|
|
///////// |
|
1381
|
|
|
|
|
|
|
// File: parsito/tree/node.h |
|
1382
|
|
|
|
|
|
|
///////// |
|
1383
|
|
|
|
|
|
|
|
|
1384
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
1385
|
|
|
|
|
|
|
// |
|
1386
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1387
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1388
|
|
|
|
|
|
|
// |
|
1389
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1390
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1391
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1392
|
|
|
|
|
|
|
|
|
1393
|
|
|
|
|
|
|
namespace parsito {

// One node of a dependency tree. A default-constructed node has an
// undefined id (-1), an empty form and no parent (head -1).
class node {
 public:
  int id;               // 0 is root, >0 is sentence node, <0 is undefined
  std::string form;     // form
  std::string lemma;    // lemma
  std::string upostag;  // universal part-of-speech tag
  std::string xpostag;  // language-specific part-of-speech tag
  std::string feats;    // list of morphological features
  int head;             // head, 0 is root, <0 is without parent
  std::string deprel;   // dependency relation to the head
  std::string deps;     // secondary dependencies
  std::string misc;     // miscellaneous information

  std::vector<int> children;

  node(int id = -1, const std::string& form = std::string()) : id(id), form(form), head(-1) {}
};

} // namespace parsito
|
1414
|
|
|
|
|
|
|
|
|
1415
|
|
|
|
|
|
|
///////// |
|
1416
|
|
|
|
|
|
|
// File: parsito/tree/tree.h |
|
1417
|
|
|
|
|
|
|
///////// |
|
1418
|
|
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
1420
|
|
|
|
|
|
|
// |
|
1421
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1422
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1423
|
|
|
|
|
|
|
// |
|
1424
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1425
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1426
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1427
|
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
namespace parsito { |
|
1429
|
|
|
|
|
|
|
|
|
1430
|
1
|
0
|
|
|
|
|
class tree { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1431
|
|
|
|
|
|
|
public: |
|
1432
|
|
|
|
|
|
|
tree(); |
|
1433
|
|
|
|
|
|
|
|
|
1434
|
|
|
|
|
|
|
vector nodes; |
|
1435
|
|
|
|
|
|
|
|
|
1436
|
|
|
|
|
|
|
bool empty(); |
|
1437
|
|
|
|
|
|
|
void clear(); |
|
1438
|
|
|
|
|
|
|
node& add_node(const string& form); |
|
1439
|
|
|
|
|
|
|
void set_head(int id, int head, const string& deprel); |
|
1440
|
|
|
|
|
|
|
void unlink_all_nodes(); |
|
1441
|
|
|
|
|
|
|
|
|
1442
|
|
|
|
|
|
|
static const string root_form; |
|
1443
|
|
|
|
|
|
|
}; |
|
1444
|
|
|
|
|
|
|
|
|
1445
|
|
|
|
|
|
|
} // namespace parsito |
|
1446
|
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
///////// |
|
1448
|
|
|
|
|
|
|
// File: parsito/configuration/configuration.h |
|
1449
|
|
|
|
|
|
|
///////// |
|
1450
|
|
|
|
|
|
|
|
|
1451
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
1452
|
|
|
|
|
|
|
// |
|
1453
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1454
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1455
|
|
|
|
|
|
|
// |
|
1456
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1457
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1458
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1459
|
|
|
|
|
|
|
|
|
1460
|
|
|
|
|
|
|
namespace parsito { |
|
1461
|
|
|
|
|
|
|
|
|
1462
|
167
|
|
|
|
|
|
class configuration { |
|
1463
|
|
|
|
|
|
|
public: |
|
1464
|
11
|
|
|
|
|
|
configuration(bool single_root) : single_root(single_root) {} |
|
1465
|
|
|
|
|
|
|
|
|
1466
|
|
|
|
|
|
|
void init(tree* t); |
|
1467
|
|
|
|
|
|
|
bool final(); |
|
1468
|
|
|
|
|
|
|
|
|
1469
|
|
|
|
|
|
|
tree* t; |
|
1470
|
|
|
|
|
|
|
vector stack; |
|
1471
|
|
|
|
|
|
|
vector buffer; |
|
1472
|
|
|
|
|
|
|
|
|
1473
|
|
|
|
|
|
|
bool single_root; |
|
1474
|
|
|
|
|
|
|
}; |
|
1475
|
|
|
|
|
|
|
|
|
1476
|
|
|
|
|
|
|
} // namespace parsito |
|
1477
|
|
|
|
|
|
|
|
|
1478
|
|
|
|
|
|
|
///////// |
|
1479
|
|
|
|
|
|
|
// File: utils/binary_decoder.h |
|
1480
|
|
|
|
|
|
|
///////// |
|
1481
|
|
|
|
|
|
|
|
|
1482
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
1483
|
|
|
|
|
|
|
// |
|
1484
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1485
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1486
|
|
|
|
|
|
|
// |
|
1487
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1488
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1489
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1490
|
|
|
|
|
|
|
|
|
1491
|
|
|
|
|
|
|
namespace utils {

//
// Declarations
//

// Exception thrown by binary_decoder when more data is requested than is
// available, or when seeking past the end of the buffer.
class binary_decoder_error : public std::runtime_error {
 public:
  explicit binary_decoder_error(const char* description) : std::runtime_error(description) {}
};

// Sequential reader of binary data kept in an internal buffer.
//
// The buffer is allocated by fill(), which returns a pointer the caller
// writes the data through. The next_* methods then read from the current
// position and advance it; they throw binary_decoder_error instead of reading
// past the end. A decoder on which fill() was never called behaves as
// an empty one.
class binary_decoder {
 public:
  // Resize the buffer to len bytes, reset the read position to its start
  // and return a writable pointer to the buffer.
  inline unsigned char* fill(unsigned len);

  // Read one value of 1, 2 or 4 bytes (in the byte order of the host).
  inline unsigned next_1B();
  inline unsigned next_2B();
  inline unsigned next_4B();
  // Read a string stored as a 1B length followed by the bytes; a length
  // byte of 255 means that the real length follows as a 4B value.
  inline void next_str(std::string& str);
  // Return a pointer to the given number of elements of type T at the
  // current position and skip them. The pointer refers into the buffer.
  template <class T> inline const T* next(unsigned elements);

  inline bool is_end();
  // Current read position as an offset from the start of the buffer.
  inline unsigned tell();
  // Move the read position; pos may be at most the size of the buffer.
  inline void seek(unsigned pos);

 private:
  // Number of bytes between the read position and the end of the data.
  inline std::size_t remaining() const;

  std::vector<unsigned char> buffer;
  const unsigned char* data = nullptr;
  const unsigned char* data_end = nullptr;
};

//
// Definitions
//

unsigned char* binary_decoder::fill(unsigned len) {
  buffer.resize(len);
  data = buffer.data();
  data_end = buffer.data() + len;

  return buffer.data();
}

std::size_t binary_decoder::remaining() const {
  return data < data_end ? std::size_t(data_end - data) : 0;
}

unsigned binary_decoder::next_1B() {
  if (remaining() < 1) throw binary_decoder_error("No more data in binary_decoder");
  return *data++;
}

unsigned binary_decoder::next_2B() {
  if (remaining() < sizeof(std::uint16_t)) throw binary_decoder_error("No more data in binary_decoder");
  // memcpy is used because the data need not be suitably aligned.
  std::uint16_t result;
  std::memcpy(&result, data, sizeof(std::uint16_t));
  data += sizeof(std::uint16_t);
  return result;
}

unsigned binary_decoder::next_4B() {
  if (remaining() < sizeof(std::uint32_t)) throw binary_decoder_error("No more data in binary_decoder");
  // memcpy is used because the data need not be suitably aligned.
  std::uint32_t result;
  std::memcpy(&result, data, sizeof(std::uint32_t));
  data += sizeof(std::uint32_t);
  return result;
}

void binary_decoder::next_str(std::string& str) {
  unsigned len = next_1B();
  if (len == 255) len = next_4B();
  str.assign(next<char>(len), len);
}

template <class T> const T* binary_decoder::next(unsigned elements) {
  // Compare element counts instead of computing data + size, which would be
  // an out-of-range pointer (and could wrap around) for a too large count.
  if (elements > remaining() / sizeof(T)) throw binary_decoder_error("No more data in binary_decoder");
  const T* result = reinterpret_cast<const T*>(data);
  data += sizeof(T) * elements;
  return result;
}

bool binary_decoder::is_end() {
  return data >= data_end;
}

unsigned binary_decoder::tell() {
  // Without a previous fill() there is no position to measure from.
  return data ? unsigned(data - buffer.data()) : 0;
}

void binary_decoder::seek(unsigned pos) {
  if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder");
  data = buffer.data() + pos;
}

} // namespace utils
|
1582
|
|
|
|
|
|
|
|
|
1583
|
|
|
|
|
|
|
///////// |
|
1584
|
|
|
|
|
|
|
// File: parsito/parser/parser.h |
|
1585
|
|
|
|
|
|
|
///////// |
|
1586
|
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
1588
|
|
|
|
|
|
|
// |
|
1589
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1590
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1591
|
|
|
|
|
|
|
// |
|
1592
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1593
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1594
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1595
|
|
|
|
|
|
|
|
|
1596
|
|
|
|
|
|
|
namespace parsito { |
|
1597
|
|
|
|
|
|
|
|
|
1598
|
|
|
|
|
|
|
// Parser |
|
1599
|
1
|
|
|
|
|
|
class parser { |
|
1600
|
|
|
|
|
|
|
public: |
|
1601
|
1
|
|
|
|
|
|
virtual ~parser() {}; |
|
1602
|
|
|
|
|
|
|
|
|
1603
|
|
|
|
|
|
|
virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const = 0; |
|
1604
|
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
enum { NO_CACHE = 0, FULL_CACHE = 2147483647}; |
|
1606
|
|
|
|
|
|
|
static parser* load(const char* file, unsigned cache = 1000); |
|
1607
|
|
|
|
|
|
|
static parser* load(istream& in, unsigned cache = 1000); |
|
1608
|
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
protected: |
|
1610
|
|
|
|
|
|
|
virtual void load(binary_decoder& data, unsigned cache) = 0; |
|
1611
|
|
|
|
|
|
|
static parser* create(const string& name); |
|
1612
|
|
|
|
|
|
|
}; |
|
1613
|
|
|
|
|
|
|
|
|
1614
|
|
|
|
|
|
|
} // namespace parsito |
|
1615
|
|
|
|
|
|
|
|
|
1616
|
|
|
|
|
|
|
///////// |
|
1617
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter.h |
|
1618
|
|
|
|
|
|
|
///////// |
|
1619
|
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
1621
|
|
|
|
|
|
|
// |
|
1622
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
1623
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1624
|
|
|
|
|
|
|
// |
|
1625
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1626
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1627
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1628
|
|
|
|
|
|
|
|
|
1629
|
1
|
|
|
|
|
|
class multiword_splitter { |
|
1630
|
|
|
|
|
|
|
public: |
|
1631
|
|
|
|
|
|
|
void append_token(string_piece token, string_piece misc, sentence& s) const; |
|
1632
|
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
static multiword_splitter* load(istream& is); |
|
1634
|
|
|
|
|
|
|
|
|
1635
|
|
|
|
|
|
|
private: |
|
1636
|
1
|
|
|
|
|
|
multiword_splitter(unsigned version) : version(version) {} |
|
1637
|
|
|
|
|
|
|
unsigned version; |
|
1638
|
|
|
|
|
|
|
enum { VERSION_LATEST = 2 }; |
|
1639
|
|
|
|
|
|
|
friend class multiword_splitter_trainer; |
|
1640
|
|
|
|
|
|
|
|
|
1641
|
0
|
|
|
|
|
|
struct suffix_info { |
|
1642
|
|
|
|
|
|
|
vector words; |
|
1643
|
|
|
|
|
|
|
}; |
|
1644
|
|
|
|
|
|
|
unordered_map full_rules, suffix_rules; |
|
1645
|
|
|
|
|
|
|
}; |
|
1646
|
|
|
|
|
|
|
|
|
1647
|
|
|
|
|
|
|
///////// |
|
1648
|
|
|
|
|
|
|
// File: utils/parse_int.h |
|
1649
|
|
|
|
|
|
|
///////// |
|
1650
|
|
|
|
|
|
|
|
|
1651
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
1652
|
|
|
|
|
|
|
// |
|
1653
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1654
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1655
|
|
|
|
|
|
|
// |
|
1656
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1657
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1658
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1659
|
|
|
|
|
|
|
|
|
1660
|
|
|
|
|
|
|
namespace utils { |
|
1661
|
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
// |
|
1663
|
|
|
|
|
|
|
// Declarations |
|
1664
|
|
|
|
|
|
|
// |
|
1665
|
|
|
|
|
|
|
|
|
1666
|
|
|
|
|
|
|
// Try to parse an int from given string. If the int cannot be parsed or does |
|
1667
|
|
|
|
|
|
|
// not fit into int, false is returned and the error string is filled using the |
|
1668
|
|
|
|
|
|
|
// value_name argument. |
|
1669
|
|
|
|
|
|
|
inline bool parse_int(string_piece str, const char* value_name, int& value, string& error); |
|
1670
|
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
// Try to parse an int from given string. If the int cannot be parsed or does |
|
1672
|
|
|
|
|
|
|
// not fit into int, an error is displayed and program exits. |
|
1673
|
|
|
|
|
|
|
inline int parse_int(string_piece str, const char* value_name); |
|
1674
|
|
|
|
|
|
|
|
|
1675
|
|
|
|
|
|
|
// |
|
1676
|
|
|
|
|
|
|
// Definitions |
|
1677
|
|
|
|
|
|
|
// |
|
1678
|
|
|
|
|
|
|
|
|
1679
|
68
|
|
|
|
|
|
bool parse_int(string_piece str, const char* value_name, int& value, string& error) { |
|
1680
|
|
|
|
|
|
|
string_piece original = str; |
|
1681
|
|
|
|
|
|
|
|
|
1682
|
|
|
|
|
|
|
// Skip spaces |
|
1683
|
34
|
50
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1684
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
1685
|
|
|
|
|
|
|
|
|
1686
|
|
|
|
|
|
|
// Allow minus |
|
1687
|
|
|
|
|
|
|
bool positive = true; |
|
1688
|
34
|
50
|
|
|
|
|
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
|
100
|
|
|
|
|
|
|
1689
|
|
|
|
|
|
|
positive = str.str[0] == '+'; |
|
1690
|
8
|
|
|
|
|
|
str.str++, str.len--; |
|
1691
|
|
|
|
|
|
|
} |
|
1692
|
|
|
|
|
|
|
|
|
1693
|
|
|
|
|
|
|
// Parse value, checking for overflow/underflow |
|
1694
|
34
|
50
|
|
|
|
|
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false; |
|
1695
|
|
|
|
|
|
|
if (!(str.str[0] >= '0' || str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false; |
|
1696
|
|
|
|
|
|
|
|
|
1697
|
34
|
|
|
|
|
|
value = 0; |
|
1698
|
68
|
100
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1699
|
34
|
100
|
|
|
|
|
if (positive) { |
|
1700
|
26
|
50
|
|
|
|
|
if (value > (numeric_limits::max() - (str.str[0] - '0')) / 10) |
|
1701
|
0
|
|
|
|
|
|
return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': overflow occured."), false; |
|
1702
|
26
|
|
|
|
|
|
value = 10 * value + (str.str[0] - '0'); |
|
1703
|
|
|
|
|
|
|
} else { |
|
1704
|
8
|
50
|
|
|
|
|
if (value < (numeric_limits::min() + (str.str[0] - '0')) / 10) |
|
1705
|
0
|
|
|
|
|
|
return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': underflow occured."), false; |
|
1706
|
8
|
|
|
|
|
|
value = 10 * value - (str.str[0] - '0'); |
|
1707
|
|
|
|
|
|
|
} |
|
1708
|
34
|
|
|
|
|
|
str.str++, str.len--; |
|
1709
|
|
|
|
|
|
|
} |
|
1710
|
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
// Skip spaces |
|
1712
|
34
|
50
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1713
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
1714
|
|
|
|
|
|
|
|
|
1715
|
|
|
|
|
|
|
// Check for remaining characters |
|
1716
|
34
|
50
|
|
|
|
|
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false; |
|
1717
|
|
|
|
|
|
|
|
|
1718
|
|
|
|
|
|
|
return true; |
|
1719
|
|
|
|
|
|
|
} |
|
1720
|
|
|
|
|
|
|
|
|
1721
|
0
|
|
|
|
|
|
// Convenience overload: parses an int from `str` and returns it. When the
// text cannot be parsed, the error message is handed to runtime_failure()
// (per the declaration comment, that reports the error and exits).
int parse_int(string_piece str, const char* value_name) {
  string error;
  int result;

  const bool parsed = parse_int(str, value_name, result, error);
  if (!parsed)
    runtime_failure(error);

  return result;
}
|
1730
|
|
|
|
|
|
|
|
|
1731
|
|
|
|
|
|
|
} // namespace utils |
|
1732
|
|
|
|
|
|
|
|
|
1733
|
|
|
|
|
|
|
///////// |
|
1734
|
|
|
|
|
|
|
// File: utils/path_from_utf8.h |
|
1735
|
|
|
|
|
|
|
///////// |
|
1736
|
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
1738
|
|
|
|
|
|
|
// |
|
1739
|
|
|
|
|
|
|
// Copyright 2022 Institute of Formal and Applied Linguistics, Faculty of |
|
1740
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1741
|
|
|
|
|
|
|
// |
|
1742
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1743
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1744
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1745
|
|
|
|
|
|
|
|
|
1746
|
|
|
|
|
|
|
namespace utils { |
|
1747
|
|
|
|
|
|
|
|
|
1748
|
|
|
|
|
|
|
// |
|
1749
|
|
|
|
|
|
|
// Declarations |
|
1750
|
|
|
|
|
|
|
// |
|
1751
|
|
|
|
|
|
|
|
|
1752
|
|
|
|
|
|
|
#ifdef _WIN32 |
|
1753
|
|
|
|
|
|
|
inline wstring path_from_utf8(const char* str); |
|
1754
|
|
|
|
|
|
|
inline wstring path_from_utf8(const string& str); |
|
1755
|
|
|
|
|
|
|
#else |
|
1756
|
|
|
|
|
|
|
inline string path_from_utf8(const char* str); |
|
1757
|
|
|
|
|
|
|
inline const string& path_from_utf8(const string& str); |
|
1758
|
|
|
|
|
|
|
#endif |
|
1759
|
|
|
|
|
|
|
|
|
1760
|
|
|
|
|
|
|
// |
|
1761
|
|
|
|
|
|
|
// Definitions |
|
1762
|
|
|
|
|
|
|
// |
|
1763
|
|
|
|
|
|
|
|
|
1764
|
|
|
|
|
|
|
#ifdef _WIN32 |
|
1765
|
|
|
|
|
|
|
|
|
1766
|
|
|
|
|
|
|
// Converts a NUL-terminated UTF-8 string to a UTF-16 wstring (code points
// above U+FFFF become surrogate pairs). Malformed input never fails: a stray
// continuation byte, an invalid lead byte, a sequence cut short by a
// non-continuation byte and a value above U+10FFFF each produce a single '?'.
inline wstring path_from_utf8(const char* str) {
  // We could implement this using codecvt_utf8_utf16, but it is not available
  // in GCC 4.9, which we still use. We could also use MultiByteToWideChar,
  // but using it would require changing our build infrastructure -- hence
  // we implement the conversion manually.
  wstring wstr;
  while (*str) {
    // Decode the lead byte: it yields the initial payload bits and the
    // number of continuation bytes that must follow.
    const unsigned char lead = (unsigned char)*str++;
    char32_t chr;
    unsigned continuations = 0;
    if (lead < 0x80) chr = lead;
    else if (lead < 0xC0) chr = '?';
    else if (lead < 0xE0) chr = lead & 0x1F, continuations = 1;
    else if (lead < 0xF0) chr = lead & 0x0F, continuations = 2;
    else if (lead < 0xF8) chr = lead & 0x07, continuations = 3;
    else chr = '?';

    // Append six payload bits per continuation byte. An unexpected byte
    // aborts the sequence and is left in place to be decoded on its own.
    for (; continuations; continuations--) {
      const unsigned char next = (unsigned char)*str;
      if (next < 0x80 || next >= 0xC0) {
        chr = '?';
        break;
      }
      chr = (chr << 6) + (next & 0x3F);
      ++str;
    }

    // Emit UTF-16.
    if (chr <= 0xFFFF) {
      wstr.push_back(chr);
    } else if (chr <= 0x10FFFF) {
      const char32_t offset = chr - 0x10000;
      wstr.push_back(0xD800 + (offset >> 10));
      wstr.push_back(0xDC00 + (offset & 0x3FF));
    } else {
      wstr.push_back('?');
    }
  }
  return wstr;
}
|
1812
|
|
|
|
|
|
|
|
|
1813
|
|
|
|
|
|
|
// Overload for std::string: converts the string's NUL-terminated contents
// with the `const char*` overload (conversion therefore stops at the first
// embedded NUL, if any).
inline wstring path_from_utf8(const string& str) {
  const char* const utf8 = str.c_str();
  return path_from_utf8(utf8);
}
|
1816
|
|
|
|
|
|
|
|
|
1817
|
|
|
|
|
|
|
#else |
|
1818
|
|
|
|
|
|
|
|
|
1819
|
|
|
|
|
|
|
// Non-Windows variant: no conversion is performed, the bytes are returned
// unchanged as a std::string.
inline string path_from_utf8(const char* str) {
  return string(str);
}
|
1822
|
|
|
|
|
|
|
|
|
1823
|
|
|
|
|
|
|
// Non-Windows variant for std::string: no conversion is performed and no
// copy is made; the returned reference is the argument itself and is valid
// only as long as the argument is.
inline const string& path_from_utf8(const string& str) {
  return str;
}
|
1826
|
|
|
|
|
|
|
|
|
1827
|
|
|
|
|
|
|
#endif |
|
1828
|
|
|
|
|
|
|
|
|
1829
|
|
|
|
|
|
|
} // namespace utils |
|
1830
|
|
|
|
|
|
|
|
|
1831
|
|
|
|
|
|
|
///////// |
|
1832
|
|
|
|
|
|
|
// File: utils/named_values.h |
|
1833
|
|
|
|
|
|
|
///////// |
|
1834
|
|
|
|
|
|
|
|
|
1835
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
1836
|
|
|
|
|
|
|
// |
|
1837
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1838
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1839
|
|
|
|
|
|
|
// |
|
1840
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1841
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1842
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1843
|
|
|
|
|
|
|
|
|
1844
|
|
|
|
|
|
|
namespace utils { |
|
1845
|
|
|
|
|
|
|
|
|
1846
|
|
|
|
|
|
|
// |
|
1847
|
|
|
|
|
|
|
// Declarations |
|
1848
|
|
|
|
|
|
|
// |
|
1849
|
|
|
|
|
|
|
|
|
1850
|
|
|
|
|
|
|
// Parser of "name=value;name=value" option strings.
class named_values {
 public:
  // Parsed options: option name -> option value. (Template arguments
  // restored; parse() stores a string value under a string name.)
  typedef unordered_map<string, string> map;

  // Parses `values` into `parsed_values`. Returns false and fills `error`
  // when the text is malformed.
  inline static bool parse(const string& values, map& parsed_values, string& error);
};
|
1856
|
|
|
|
|
|
|
|
|
1857
|
|
|
|
|
|
|
// |
|
1858
|
|
|
|
|
|
|
// Definitions |
|
1859
|
|
|
|
|
|
|
// |
|
1860
|
|
|
|
|
|
|
|
|
1861
|
3
|
|
|
|
|
|
bool named_values::parse(const string& values, map& parsed_values, string& error) { |
|
1862
|
|
|
|
|
|
|
error.clear(); |
|
1863
|
|
|
|
|
|
|
parsed_values.clear(); |
|
1864
|
|
|
|
|
|
|
|
|
1865
|
|
|
|
|
|
|
string name, file; |
|
1866
|
3
|
50
|
|
|
|
|
for (size_t start = 0; start < values.size(); ) { |
|
1867
|
0
|
0
|
|
|
|
|
while (start < values.size() && values[start] == ';') start++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1868
|
0
|
0
|
|
|
|
|
if (start >= values.size()) break; |
|
1869
|
|
|
|
|
|
|
|
|
1870
|
|
|
|
|
|
|
size_t name_end = values.find_first_of("=;", start); |
|
1871
|
0
|
0
|
|
|
|
|
name.assign(values, start, name_end - start); |
|
1872
|
|
|
|
|
|
|
string& value = parsed_values[name]; |
|
1873
|
|
|
|
|
|
|
|
|
1874
|
0
|
0
|
|
|
|
|
if (name_end == string::npos) { |
|
1875
|
|
|
|
|
|
|
start = name_end; |
|
1876
|
0
|
0
|
|
|
|
|
} else if (values[name_end] == ';') { |
|
1877
|
0
|
|
|
|
|
|
start = name_end + 1; |
|
1878
|
|
|
|
|
|
|
} else /* if (values[name_end] == '=') */ { |
|
1879
|
|
|
|
|
|
|
size_t equal_sign = name_end; |
|
1880
|
|
|
|
|
|
|
|
|
1881
|
0
|
0
|
|
|
|
|
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1882
|
|
|
|
|
|
|
// Value of type file: |
|
1883
|
|
|
|
|
|
|
size_t file_name = equal_sign + 1 + 5; |
|
1884
|
0
|
|
|
|
|
|
size_t semicolon = min(values.find(';', file_name), values.size()); |
|
1885
|
|
|
|
|
|
|
|
|
1886
|
0
|
0
|
|
|
|
|
file.assign(values, file_name, semicolon - file_name); |
|
1887
|
0
|
0
|
|
|
|
|
ifstream is(path_from_utf8(file).c_str()); |
|
1888
|
0
|
0
|
|
|
|
|
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
|
|
1890
|
|
|
|
|
|
|
char buffer[1024]; |
|
1891
|
0
|
0
|
|
|
|
|
for (value.clear(); is.read(buffer, sizeof(buffer)); ) |
|
|
|
0
|
|
|
|
|
|
|
1892
|
0
|
0
|
|
|
|
|
value.append(buffer, sizeof(buffer)); |
|
1893
|
0
|
0
|
|
|
|
|
value.append(buffer, is.gcount()); |
|
1894
|
|
|
|
|
|
|
|
|
1895
|
0
|
|
|
|
|
|
start = semicolon + 1; |
|
1896
|
0
|
0
|
|
|
|
|
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1897
|
|
|
|
|
|
|
// Value of type data: |
|
1898
|
|
|
|
|
|
|
size_t data_size_start = equal_sign + 1 + 5; |
|
1899
|
0
|
|
|
|
|
|
size_t data_size_end = values.find(':', data_size_start); |
|
1900
|
0
|
0
|
|
|
|
|
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1901
|
|
|
|
|
|
|
|
|
1902
|
|
|
|
|
|
|
int data_size; |
|
1903
|
0
|
0
|
|
|
|
|
if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
1904
|
|
|
|
|
|
|
|
|
1905
|
0
|
|
|
|
|
|
size_t data_start = data_size_end + 1, data_end = data_start + data_size; |
|
1906
|
0
|
0
|
|
|
|
|
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1907
|
0
|
0
|
|
|
|
|
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1908
|
|
|
|
|
|
|
|
|
1909
|
0
|
0
|
|
|
|
|
value.assign(values, data_start, data_end - data_start); |
|
1910
|
0
|
|
|
|
|
|
start = data_end + 1; |
|
1911
|
|
|
|
|
|
|
} else { |
|
1912
|
|
|
|
|
|
|
// Value of string type |
|
1913
|
0
|
|
|
|
|
|
size_t semicolon = min(values.find(';', equal_sign), values.size()); |
|
1914
|
0
|
0
|
|
|
|
|
value.assign(values, equal_sign + 1, semicolon - equal_sign - 1); |
|
1915
|
0
|
|
|
|
|
|
start = semicolon + 1; |
|
1916
|
|
|
|
|
|
|
} |
|
1917
|
|
|
|
|
|
|
} |
|
1918
|
|
|
|
|
|
|
} |
|
1919
|
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
return true; |
|
1921
|
|
|
|
|
|
|
} |
|
1922
|
|
|
|
|
|
|
|
|
1923
|
|
|
|
|
|
|
} // namespace utils |
|
1924
|
|
|
|
|
|
|
|
|
1925
|
|
|
|
|
|
|
///////// |
|
1926
|
|
|
|
|
|
|
// File: utils/threadsafe_stack.h |
|
1927
|
|
|
|
|
|
|
///////// |
|
1928
|
|
|
|
|
|
|
|
|
1929
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
1930
|
|
|
|
|
|
|
// |
|
1931
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1932
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1933
|
|
|
|
|
|
|
// |
|
1934
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1935
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1936
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1937
|
|
|
|
|
|
|
|
|
1938
|
|
|
|
|
|
|
namespace utils { |
|
1939
|
|
|
|
|
|
|
|
|
1940
|
|
|
|
|
|
|
// |
|
1941
|
|
|
|
|
|
|
// Declarations |
|
1942
|
|
|
|
|
|
|
// |
|
1943
|
|
|
|
|
|
|
|
|
1944
|
|
|
|
|
|
|
// A stack of heap-allocated objects guarded by a spin lock. The stack owns
// the objects it currently holds and deletes them when it is destroyed.
template <class T>
class threadsafe_stack {
 public:
  // Pushes `t`; the stack takes ownership of it.
  inline void push(T* t);
  // Pops the most recently pushed object and hands ownership to the caller;
  // returns nullptr when the stack is empty.
  inline T* pop();

 private:
  // Holds the spin lock for the lifetime of the guard, so the lock is
  // released on every exit path including exceptions.
  struct spin_guard {
    explicit spin_guard(atomic_flag& f) : flag(f) {
      while (flag.test_and_set(memory_order_acquire)) {}
    }
    ~spin_guard() { flag.clear(memory_order_release); }
    spin_guard(const spin_guard&) = delete;
    spin_guard& operator=(const spin_guard&) = delete;

    atomic_flag& flag;
  };

  vector<unique_ptr<T>> stack;
  atomic_flag lock = ATOMIC_FLAG_INIT;
};

//
// Definitions
//

template <class T>
void threadsafe_stack<T>::push(T* t) {
  // Take ownership before anything can throw, so `t` is not leaked.
  unique_ptr<T> owner(t);

  spin_guard guard(lock);
  // Growing the vector may throw; previously that left the lock held forever
  // and leaked `t`. The slot is created first, then filled without throwing.
  stack.emplace_back();
  stack.back().reset(owner.release());
}

template <class T>
T* threadsafe_stack<T>::pop() {
  T* res = nullptr;

  spin_guard guard(lock);
  if (!stack.empty()) {
    res = stack.back().release();
    stack.pop_back();
  }

  return res;
}
|
1979
|
|
|
|
|
|
|
|
|
1980
|
|
|
|
|
|
|
} // namespace utils |
|
1981
|
|
|
|
|
|
|
|
|
1982
|
|
|
|
|
|
|
///////// |
|
1983
|
|
|
|
|
|
|
// File: model/model_morphodita_parsito.h |
|
1984
|
|
|
|
|
|
|
///////// |
|
1985
|
|
|
|
|
|
|
|
|
1986
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
1987
|
|
|
|
|
|
|
// |
|
1988
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
1989
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
1990
|
|
|
|
|
|
|
// |
|
1991
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
1992
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
1993
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
1994
|
|
|
|
|
|
|
|
|
1995
|
4
|
|
|
|
|
|
class model_morphodita_parsito : public model { |
|
1996
|
|
|
|
|
|
|
public: |
|
1997
|
|
|
|
|
|
|
virtual input_format* new_tokenizer(const string& options) const override; |
|
1998
|
|
|
|
|
|
|
virtual bool tag(sentence& s, const string& options, string& error) const override; |
|
1999
|
|
|
|
|
|
|
virtual bool parse(sentence& s, const string& options, string& error) const override; |
|
2000
|
|
|
|
|
|
|
|
|
2001
|
|
|
|
|
|
|
static model* load(istream& is); |
|
2002
|
|
|
|
|
|
|
|
|
2003
|
|
|
|
|
|
|
private: |
|
2004
|
|
|
|
|
|
|
model_morphodita_parsito(unsigned version); |
|
2005
|
|
|
|
|
|
|
unsigned version; |
|
2006
|
|
|
|
|
|
|
enum { VERSION_LATEST = 3 }; |
|
2007
|
|
|
|
|
|
|
|
|
2008
|
|
|
|
|
|
|
unique_ptr tokenizer_factory; |
|
2009
|
|
|
|
|
|
|
unique_ptr splitter; |
|
2010
|
1
|
|
|
|
|
|
struct tagger_model { |
|
2011
|
|
|
|
|
|
|
bool raw; bool upostag; int lemma; bool xpostag, feats; |
|
2012
|
|
|
|
|
|
|
unique_ptr tagger; |
|
2013
|
|
|
|
|
|
|
|
|
2014
|
|
|
|
|
|
|
tagger_model(bool raw, bool upostag, int lemma, bool xpostag, bool feats, morphodita::tagger* tagger) |
|
2015
|
1
|
|
|
|
|
|
: raw(raw), upostag(upostag), lemma(lemma), xpostag(xpostag), feats(feats), tagger(tagger) {} |
|
2016
|
|
|
|
|
|
|
}; |
|
2017
|
|
|
|
|
|
|
vector taggers; |
|
2018
|
|
|
|
|
|
|
unique_ptr parser; |
|
2019
|
|
|
|
|
|
|
|
|
2020
|
3
|
|
|
|
|
|
struct tagger_cache { |
|
2021
|
|
|
|
|
|
|
vector forms_normalized; |
|
2022
|
|
|
|
|
|
|
vector forms_string_pieces; |
|
2023
|
|
|
|
|
|
|
vector lemmas; |
|
2024
|
|
|
|
|
|
|
}; |
|
2025
|
|
|
|
|
|
|
mutable threadsafe_stack tagger_caches; |
|
2026
|
|
|
|
|
|
|
|
|
2027
|
1
|
50
|
|
|
|
|
struct parser_cache { |
|
2028
|
|
|
|
|
|
|
parsito::tree tree; |
|
2029
|
|
|
|
|
|
|
named_values::map options; |
|
2030
|
|
|
|
|
|
|
}; |
|
2031
|
|
|
|
|
|
|
mutable threadsafe_stack parser_caches; |
|
2032
|
|
|
|
|
|
|
|
|
2033
|
|
|
|
|
|
|
bool parse(sentence& s, const string& options, string& error, double* cost) const; |
|
2034
|
|
|
|
|
|
|
|
|
2035
|
0
|
|
|
|
|
|
class joint_with_parsing_tokenizer : public input_format { |
|
2036
|
|
|
|
|
|
|
public: |
|
2037
|
|
|
|
|
|
|
joint_with_parsing_tokenizer(input_format* tokenizer, const model_morphodita_parsito& model, |
|
2038
|
|
|
|
|
|
|
int max_sentence_len, double change_boundary_logprob, double sentence_logprob) |
|
2039
|
|
|
|
|
|
|
: tokenizer(tokenizer), model(model), max_sentence_len(max_sentence_len), |
|
2040
|
0
|
|
|
|
|
|
change_boundary_logprob(change_boundary_logprob), sentence_logprob(sentence_logprob) {} |
|
2041
|
|
|
|
|
|
|
|
|
2042
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
|
2043
|
|
|
|
|
|
|
virtual void reset_document(string_piece id) override; |
|
2044
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
2045
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
|
2046
|
|
|
|
|
|
|
|
|
2047
|
|
|
|
|
|
|
private: |
|
2048
|
|
|
|
|
|
|
bool parse_paragraph(vector& paragraph, string& error); |
|
2049
|
|
|
|
|
|
|
|
|
2050
|
|
|
|
|
|
|
unique_ptr tokenizer; |
|
2051
|
|
|
|
|
|
|
const model_morphodita_parsito& model; |
|
2052
|
|
|
|
|
|
|
int max_sentence_len; |
|
2053
|
|
|
|
|
|
|
double change_boundary_logprob; |
|
2054
|
|
|
|
|
|
|
double sentence_logprob; |
|
2055
|
|
|
|
|
|
|
|
|
2056
|
|
|
|
|
|
|
string_piece text; |
|
2057
|
|
|
|
|
|
|
string text_copy; |
|
2058
|
|
|
|
|
|
|
bool new_document = true; |
|
2059
|
|
|
|
|
|
|
string document_id; |
|
2060
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
|
2061
|
|
|
|
|
|
|
vector sentences; |
|
2062
|
|
|
|
|
|
|
size_t sentences_index = 0; |
|
2063
|
|
|
|
|
|
|
}; |
|
2064
|
|
|
|
|
|
|
|
|
2065
|
|
|
|
|
|
|
void fill_word_analysis(const morphodita::tagged_lemma& analysis, bool raw, bool upostag, int lemma, bool xpostag, bool feats, word& word) const; |
|
2066
|
|
|
|
|
|
|
const string& normalize_form(string_piece form, string& output) const; |
|
2067
|
|
|
|
|
|
|
const string& normalize_lemma(string_piece lemma, string& output) const; |
|
2068
|
|
|
|
|
|
|
friend class trainer_morphodita_parsito; |
|
2069
|
|
|
|
|
|
|
}; |
|
2070
|
|
|
|
|
|
|
|
|
2071
|
|
|
|
|
|
|
///////// |
|
2072
|
|
|
|
|
|
|
// File: model/model.cpp |
|
2073
|
|
|
|
|
|
|
///////// |
|
2074
|
|
|
|
|
|
|
|
|
2075
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
2076
|
|
|
|
|
|
|
// |
|
2077
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2078
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2079
|
|
|
|
|
|
|
// |
|
2080
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2081
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2082
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2083
|
|
|
|
|
|
|
|
|
2084
|
2
|
|
|
|
|
|
// Definitions of the option-name constants declared in class model.
// DEFAULT is the empty string; the others name tokenizer options.
const string model::DEFAULT;
const string model::TOKENIZER_NORMALIZED_SPACES("normalized_spaces");
const string model::TOKENIZER_PRESEGMENTED("presegmented");
const string model::TOKENIZER_RANGES("ranges");
|
2088
|
|
|
|
|
|
|
|
|
2089
|
1
|
|
|
|
|
|
// Loads a model from the file `fname` (a UTF-8 path), opened in binary mode.
// Returns nullptr when the file cannot be opened; otherwise the result of
// the stream overload.
model* model::load(const char* fname) {
  const auto path = path_from_utf8(fname);

  ifstream in(path.c_str(), ifstream::in | ifstream::binary);
  return in.is_open() ? load(in) : nullptr;
}
|
2094
|
|
|
|
|
|
|
|
|
2095
|
1
|
|
|
|
|
|
// Loads a model from a stream. The stream starts with a one-byte length
// followed by that many bytes of model type name; the rest is handed to the
// loader of that model type. Returns nullptr when the header cannot be read
// or the model type is not recognized.
model* model::load(istream& is) {
  char len;
  if (!is.get(len)) return nullptr;

  // The length is an unsigned byte. Where plain char is signed, a byte >= 0x80
  // used to turn into a huge size_t, making the string constructor throw
  // instead of this function returning nullptr for a malformed stream.
  const size_t name_len = (unsigned char) len;
  string name(name_len, ' ');
  if (!is.read(&name[0], name_len)) return nullptr;

  if (name == "morphodita_parsito") return model_morphodita_parsito::load(is);

  return nullptr;
}
|
2105
|
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
///////// |
|
2107
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger_ids.h |
|
2108
|
|
|
|
|
|
|
///////// |
|
2109
|
|
|
|
|
|
|
|
|
2110
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
2111
|
|
|
|
|
|
|
// |
|
2112
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2113
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2114
|
|
|
|
|
|
|
// |
|
2115
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2116
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2117
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2118
|
|
|
|
|
|
|
|
|
2119
|
|
|
|
|
|
|
namespace morphodita { |
|
2120
|
|
|
|
|
|
|
|
|
2121
|
|
|
|
|
|
|
// Identifiers of the tagger variants together with their textual names and
// per-variant parameters.
class tagger_ids {
 public:
  enum tagger_id {
    CZECH2 = 0, CZECH3 = 1, CZECH2_3 = 6,
    /* 2 was used internally for ENGLISH3, but never released publicly */
    GENERIC2 = 3, GENERIC3 = 4, GENERIC4 = 5, GENERIC2_3 = 7,
    CONLLU2 = 8, CONLLU2_3 = 9, CONLLU3 = 10,
  };

  // Translates a tagger name to its id. Returns true and sets `id` when the
  // name is known; otherwise returns false and leaves `id` untouched.
  static bool parse(const string& str, tagger_id& id) {
    static const struct { const char* name; tagger_id id; } known[] = {
      {"czech2", CZECH2}, {"czech2_3", CZECH2_3}, {"czech3", CZECH3},
      {"generic2", GENERIC2}, {"generic2_3", GENERIC2_3}, {"generic3", GENERIC3}, {"generic4", GENERIC4},
      {"conllu2", CONLLU2}, {"conllu2_3", CONLLU2_3}, {"conllu3", CONLLU3},
    };

    for (const auto& entry : known)
      if (str == entry.name)
        return id = entry.id, true;
    return false;
  }

  // Decoding order of the given tagger; 0 for a value outside the enum.
  static int decoding_order(tagger_id id) {
    switch (id) {
      case CZECH2: case CZECH2_3:
      case GENERIC2: case GENERIC2_3:
      case CONLLU2: case CONLLU2_3:
        return 2;
      case CZECH3: case GENERIC3: case CONLLU3:
        return 3;
      case GENERIC4:
        return 4;
    }
    return 0;
  }

  // Window size of the given tagger: 3 for the "2_3" variants, otherwise
  // equal to the decoding order.
  static int window_size(tagger_id id) {
    const bool order2_window3 = id == CZECH2_3 || id == GENERIC2_3 || id == CONLLU2_3;
    return order2_window3 ? 3 : decoding_order(id);
  }
};
|
2170
|
|
|
|
|
|
|
|
|
2171
|
|
|
|
|
|
|
typedef tagger_ids::tagger_id tagger_id; |
|
2172
|
|
|
|
|
|
|
|
|
2173
|
|
|
|
|
|
|
} // namespace morphodita |
|
2174
|
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
///////// |
|
2176
|
|
|
|
|
|
|
// File: tokenizer/morphodita_tokenizer_wrapper.h |
|
2177
|
|
|
|
|
|
|
///////// |
|
2178
|
|
|
|
|
|
|
|
|
2179
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
2180
|
|
|
|
|
|
|
// |
|
2181
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2182
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2183
|
|
|
|
|
|
|
// |
|
2184
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2185
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2186
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2187
|
|
|
|
|
|
|
|
|
2188
|
4
|
|
|
|
|
|
class morphodita_tokenizer_wrapper : public input_format { |
|
2189
|
|
|
|
|
|
|
public: |
|
2190
|
|
|
|
|
|
|
morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter, bool normalized_spaces, bool token_ranges); |
|
2191
|
|
|
|
|
|
|
|
|
2192
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
|
2193
|
|
|
|
|
|
|
virtual void reset_document(string_piece id) override; |
|
2194
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
2195
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
|
2196
|
|
|
|
|
|
|
|
|
2197
|
|
|
|
|
|
|
private: |
|
2198
|
|
|
|
|
|
|
unique_ptr tokenizer; |
|
2199
|
|
|
|
|
|
|
const multiword_splitter* splitter; |
|
2200
|
|
|
|
|
|
|
bool normalized_spaces, token_ranges; |
|
2201
|
|
|
|
|
|
|
|
|
2202
|
|
|
|
|
|
|
bool new_document = true; |
|
2203
|
|
|
|
|
|
|
string document_id; |
|
2204
|
|
|
|
|
|
|
unsigned preceeding_newlines = 2; |
|
2205
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
|
2206
|
|
|
|
|
|
|
|
|
2207
|
|
|
|
|
|
|
string_piece text; |
|
2208
|
|
|
|
|
|
|
string text_copy; |
|
2209
|
|
|
|
|
|
|
size_t unicode_offset = 0, text_unicode_length = 0; |
|
2210
|
|
|
|
|
|
|
string saved_spaces; |
|
2211
|
|
|
|
|
|
|
vector forms; |
|
2212
|
|
|
|
|
|
|
vector tokens; |
|
2213
|
|
|
|
|
|
|
token tok; |
|
2214
|
|
|
|
|
|
|
}; |
|
2215
|
|
|
|
|
|
|
|
|
2216
|
|
|
|
|
|
|
///////// |
|
2217
|
|
|
|
|
|
|
// File: utils/getpara.h |
|
2218
|
|
|
|
|
|
|
///////// |
|
2219
|
|
|
|
|
|
|
|
|
2220
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
2221
|
|
|
|
|
|
|
// |
|
2222
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2223
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2224
|
|
|
|
|
|
|
// |
|
2225
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2226
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2227
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2228
|
|
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
|
namespace utils {

//
// Declarations
//

// Read a paragraph: lines are consumed until EOF or until an empty line has
// been read. Every consumed line, including the terminating empty one, is
// stored in para followed by '\n'.
inline istream& getpara(istream& is, string& para);

//
// Definitions
//

istream& getpara(istream& is, string& para) {
  para.clear();

  string line;
  while (getline(is, line)) {
    para += line;
    para += '\n';

    // An empty line terminates the paragraph.
    if (line.empty()) break;
  }

  // When some data were read before hitting EOF, keep only eofbit set so the
  // stream does not report failure for this call.
  if (!para.empty() && is.eof()) is.clear(istream::eofbit);
  return is;
}

} // namespace utils
|
2257
|
|
|
|
|
|
|
|
|
2258
|
|
|
|
|
|
|
///////// |
|
2259
|
|
|
|
|
|
|
// File: utils/parse_double.h |
|
2260
|
|
|
|
|
|
|
///////// |
|
2261
|
|
|
|
|
|
|
|
|
2262
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
2263
|
|
|
|
|
|
|
// |
|
2264
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2265
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2266
|
|
|
|
|
|
|
// |
|
2267
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2268
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2269
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2270
|
|
|
|
|
|
|
|
|
2271
|
|
|
|
|
|
|
namespace utils { |
|
2272
|
|
|
|
|
|
|
|
|
2273
|
|
|
|
|
|
|
// |
|
2274
|
|
|
|
|
|
|
// Declarations |
|
2275
|
|
|
|
|
|
|
// |
|
2276
|
|
|
|
|
|
|
|
|
2277
|
|
|
|
|
|
|
// Try to parse an double from given string. If the double cannot be parsed or does |
|
2278
|
|
|
|
|
|
|
// not fit doubleo double, false is returned and the error string is filled using the |
|
2279
|
|
|
|
|
|
|
// value_name argument. |
|
2280
|
|
|
|
|
|
|
inline bool parse_double(string_piece str, const char* value_name, double& value, string& error); |
|
2281
|
|
|
|
|
|
|
|
|
2282
|
|
|
|
|
|
|
// Try to parse an double from given string. If the double cannot be parsed or does |
|
2283
|
|
|
|
|
|
|
// not fit doubleo double, an error is displayed and program exits. |
|
2284
|
|
|
|
|
|
|
inline double parse_double(string_piece str, const char* value_name); |
|
2285
|
|
|
|
|
|
|
|
|
2286
|
|
|
|
|
|
|
// |
|
2287
|
|
|
|
|
|
|
// Definitions |
|
2288
|
|
|
|
|
|
|
// |
|
2289
|
|
|
|
|
|
|
|
|
2290
|
0
|
|
|
|
|
|
bool parse_double(string_piece str, const char* value_name, double& value, string& error) { |
|
2291
|
|
|
|
|
|
|
string_piece original = str; |
|
2292
|
|
|
|
|
|
|
|
|
2293
|
|
|
|
|
|
|
// Skip spaces |
|
2294
|
0
|
0
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2295
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2296
|
|
|
|
|
|
|
|
|
2297
|
|
|
|
|
|
|
// Allow plus/minus |
|
2298
|
|
|
|
|
|
|
bool negative = false; |
|
2299
|
0
|
0
|
|
|
|
|
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
|
0
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
negative = str.str[0] == '-'; |
|
2301
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2302
|
|
|
|
|
|
|
} |
|
2303
|
|
|
|
|
|
|
|
|
2304
|
|
|
|
|
|
|
// Parse value, checking for overflow/underflow |
|
2305
|
0
|
0
|
|
|
|
|
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': empty string."), false; |
|
2306
|
|
|
|
|
|
|
if (!(str.str[0] >= '0' || str.str[0] <= '9')) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false; |
|
2307
|
|
|
|
|
|
|
|
|
2308
|
0
|
|
|
|
|
|
value = 0; |
|
2309
|
0
|
0
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2310
|
0
|
|
|
|
|
|
value = 10 * value + (str.str[0] - '0'); |
|
2311
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2312
|
|
|
|
|
|
|
} |
|
2313
|
|
|
|
|
|
|
|
|
2314
|
|
|
|
|
|
|
// If there is a decimal point, parse the rest of the |
|
2315
|
0
|
0
|
|
|
|
|
if (str.len && str.str[0] == '.') { |
|
|
|
0
|
|
|
|
|
|
|
2316
|
|
|
|
|
|
|
double divider = 1; |
|
2317
|
|
|
|
|
|
|
|
|
2318
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2319
|
0
|
0
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2320
|
0
|
|
|
|
|
|
value = 10 * value + (str.str[0] - '0'); |
|
2321
|
0
|
|
|
|
|
|
divider *= 10.; |
|
2322
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2323
|
|
|
|
|
|
|
} |
|
2324
|
|
|
|
|
|
|
|
|
2325
|
0
|
|
|
|
|
|
value /= divider; |
|
2326
|
|
|
|
|
|
|
} |
|
2327
|
0
|
0
|
|
|
|
|
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
|
2328
|
|
|
|
|
|
|
|
|
2329
|
|
|
|
|
|
|
// Optionally parse an exponent |
|
2330
|
0
|
0
|
|
|
|
|
if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) { |
|
|
|
0
|
|
|
|
|
|
|
2331
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2332
|
|
|
|
|
|
|
|
|
2333
|
|
|
|
|
|
|
double exponent = 0; |
|
2334
|
|
|
|
|
|
|
bool exponent_negative = false; |
|
2335
|
0
|
0
|
|
|
|
|
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
|
0
|
|
|
|
|
|
|
2336
|
|
|
|
|
|
|
exponent_negative = str.str[0] == '-'; |
|
2337
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2338
|
|
|
|
|
|
|
} |
|
2339
|
|
|
|
|
|
|
|
|
2340
|
0
|
0
|
|
|
|
|
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2341
|
0
|
|
|
|
|
|
exponent = 10 * exponent + (str.str[0] - '0'); |
|
2342
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2343
|
|
|
|
|
|
|
} |
|
2344
|
|
|
|
|
|
|
|
|
2345
|
0
|
0
|
|
|
|
|
exponent = pow(10., exponent_negative ? -exponent : exponent); |
|
2346
|
0
|
0
|
|
|
|
|
if (!isfinite(exponent)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent overflow occured."), false; |
|
2347
|
0
|
0
|
|
|
|
|
if (exponent == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent underflow occured."), false; |
|
2348
|
|
|
|
|
|
|
|
|
2349
|
0
|
0
|
|
|
|
|
if (value) { |
|
2350
|
0
|
|
|
|
|
|
value *= exponent; |
|
2351
|
0
|
0
|
|
|
|
|
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
|
2352
|
0
|
0
|
|
|
|
|
if (value == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': underflow occured."), false; |
|
2353
|
|
|
|
|
|
|
} |
|
2354
|
|
|
|
|
|
|
} |
|
2355
|
|
|
|
|
|
|
|
|
2356
|
|
|
|
|
|
|
// Apply initial minus |
|
2357
|
0
|
0
|
|
|
|
|
if (negative) value *= -1; |
|
2358
|
|
|
|
|
|
|
|
|
2359
|
|
|
|
|
|
|
// Skip spaces |
|
2360
|
0
|
0
|
|
|
|
|
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2361
|
0
|
|
|
|
|
|
str.str++, str.len--; |
|
2362
|
|
|
|
|
|
|
|
|
2363
|
|
|
|
|
|
|
// Check for remaining characters |
|
2364
|
0
|
0
|
|
|
|
|
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false; |
|
2365
|
|
|
|
|
|
|
|
|
2366
|
|
|
|
|
|
|
return true; |
|
2367
|
|
|
|
|
|
|
} |
|
2368
|
|
|
|
|
|
|
|
|
2369
|
0
|
|
|
|
|
|
double parse_double(string_piece str, const char* value_name) { |
|
2370
|
|
|
|
|
|
|
double result; |
|
2371
|
|
|
|
|
|
|
string error; |
|
2372
|
|
|
|
|
|
|
|
|
2373
|
0
|
0
|
|
|
|
|
if (!parse_double(str, value_name, result, error)) |
|
|
|
0
|
|
|
|
|
|
|
2374
|
0
|
|
|
|
|
|
runtime_failure(error); |
|
2375
|
|
|
|
|
|
|
|
|
2376
|
0
|
|
|
|
|
|
return result; |
|
2377
|
|
|
|
|
|
|
} |
|
2378
|
|
|
|
|
|
|
|
|
2379
|
|
|
|
|
|
|
} // namespace utils |
|
2380
|
|
|
|
|
|
|
|
|
2381
|
|
|
|
|
|
|
///////// |
|
2382
|
|
|
|
|
|
|
// File: model/model_morphodita_parsito.cpp |
|
2383
|
|
|
|
|
|
|
///////// |
|
2384
|
|
|
|
|
|
|
|
|
2385
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
2386
|
|
|
|
|
|
|
// |
|
2387
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2388
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2389
|
|
|
|
|
|
|
// |
|
2390
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2391
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2392
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2393
|
|
|
|
|
|
|
|
|
2394
|
|
|
|
|
|
|
// Versions: |
|
2395
|
|
|
|
|
|
|
// 1 - initial version |
|
2396
|
|
|
|
|
|
|
// 2 - add absolute lemmas (tagger_model::lemma == 2) |
|
2397
|
|
|
|
|
|
|
// - use Arabic and space normalization |
|
2398
|
|
|
|
|
|
|
|
|
2399
|
1
|
|
|
|
|
|
// Create a tokenizer for this model, configured by a named_values option string.
//
// Options read here: normalized_spaces, ranges, presegmented,
// joint_with_parsing, joint_max_sentence_len, joint_change_boundary_logprob
// and joint_sentence_logprob.
//
// Returns nullptr when the model has no tokenizer or when any option fails
// to parse; otherwise returns a released pointer the caller must take care of.
input_format* model_morphodita_parsito::new_tokenizer(const string& options) const {
  if (!tokenizer_factory)
    return nullptr;

  named_values::map parsed_options;
  string parse_error;
  if (!named_values::parse(options, parsed_options, parse_error))
    return nullptr;

  bool normalized_spaces = parsed_options.count("normalized_spaces");
  bool token_ranges = parsed_options.count("ranges");

  // Use the morphology of the first tagger when one is available; guard
  // against a null tagger the same way tag() does.
  const auto* morpho = !taggers.empty() && taggers[0].tagger ? taggers[0].tagger->get_morpho() : nullptr;

  // The element type must be spelled out: it cannot be deduced from a raw
  // pointer, and the later reset() calls store other input_format subclasses.
  unique_ptr<input_format> result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges));

  // Presegmented
  if (parsed_options.count("presegmented") && result)
    result.reset(input_format::new_presegmented_tokenizer(result.release()));

  // Joint with parsing
  if (parsed_options.count("joint_with_parsing") && result) {
    int max_sentence_len = 20;
    if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error))
      return nullptr;

    double change_boundary_logprob = -0.5;
    if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error))
      return nullptr;

    double sentence_logprob = -0.5;
    if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error))
      return nullptr;

    result.reset(new joint_with_parsing_tokenizer(result.release(), *this, max_sentence_len, change_boundary_logprob, sentence_logprob));
  }

  return result.release();
}
|
2437
|
|
|
|
|
|
|
|
|
2438
|
1
|
|
|
|
|
|
bool model_morphodita_parsito::tag(sentence& s, const string& /*options*/, string& error) const { |
|
2439
|
|
|
|
|
|
|
error.clear(); |
|
2440
|
|
|
|
|
|
|
|
|
2441
|
1
|
50
|
|
|
|
|
if (taggers.empty()) return error.assign("No tagger defined for the UDPipe model!"), false; |
|
2442
|
1
|
50
|
|
|
|
|
if (s.empty()) return true; |
|
2443
|
|
|
|
|
|
|
|
|
2444
|
1
|
|
|
|
|
|
tagger_cache* c = tagger_caches.pop(); |
|
2445
|
1
|
50
|
|
|
|
|
if (!c) c = new tagger_cache(); |
|
2446
|
|
|
|
|
|
|
|
|
2447
|
|
|
|
|
|
|
// Prepare input forms |
|
2448
|
1
|
|
|
|
|
|
c->forms_normalized.resize(s.words.size() - 1); |
|
2449
|
1
|
|
|
|
|
|
c->forms_string_pieces.resize(s.words.size() - 1); |
|
2450
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) |
|
2451
|
7
|
|
|
|
|
|
c->forms_string_pieces[i - 1] = normalize_form(s.words[i].form, c->forms_normalized[i - 1]); |
|
2452
|
|
|
|
|
|
|
|
|
2453
|
|
|
|
|
|
|
// Clear first |
|
2454
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
|
2455
|
7
|
|
|
|
|
|
s.words[i].lemma.assign("_"); |
|
2456
|
|
|
|
|
|
|
s.words[i].upostag.clear(); |
|
2457
|
|
|
|
|
|
|
s.words[i].xpostag.clear(); |
|
2458
|
|
|
|
|
|
|
s.words[i].feats.clear(); |
|
2459
|
|
|
|
|
|
|
} |
|
2460
|
|
|
|
|
|
|
|
|
2461
|
|
|
|
|
|
|
// Fill information from the tagger models |
|
2462
|
2
|
100
|
|
|
|
|
for (auto&& tagger : taggers) { |
|
2463
|
1
|
50
|
|
|
|
|
if (!tagger.tagger) return error.assign("No tagger defined for the UDPipe model!"), false; |
|
2464
|
|
|
|
|
|
|
|
|
2465
|
1
|
|
|
|
|
|
tagger.tagger->tag(c->forms_string_pieces, c->lemmas); |
|
2466
|
|
|
|
|
|
|
|
|
2467
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < c->lemmas.size(); i++) |
|
2468
|
7
|
|
|
|
|
|
fill_word_analysis(c->lemmas[i], tagger.raw, tagger.upostag, tagger.lemma, tagger.xpostag, tagger.feats, s.words[i+1]); |
|
2469
|
|
|
|
|
|
|
} |
|
2470
|
|
|
|
|
|
|
|
|
2471
|
|
|
|
|
|
|
// For raw tagger models, fill MorphoGuesser=Yes where appropriate |
|
2472
|
1
|
50
|
|
|
|
|
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
2473
|
0
|
|
|
|
|
|
const auto* morpho = taggers[0].tagger->get_morpho(); |
|
2474
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < c->forms_string_pieces.size(); i++) { |
|
2475
|
0
|
0
|
|
|
|
|
if (morpho->analyze(c->forms_string_pieces[i], morphodita::morpho::GUESSER, c->lemmas) == morphodita::morpho::GUESSER) |
|
2476
|
0
|
0
|
|
|
|
|
s.words[i + 1].misc.append(s.words[i + 1].misc.empty() ? "" : "|").append("MorphoGuesser=Yes"); |
|
2477
|
|
|
|
|
|
|
} |
|
2478
|
|
|
|
|
|
|
} |
|
2479
|
|
|
|
|
|
|
|
|
2480
|
1
|
|
|
|
|
|
tagger_caches.push(c); |
|
2481
|
1
|
|
|
|
|
|
return true; |
|
2482
|
|
|
|
|
|
|
} |
|
2483
|
|
|
|
|
|
|
|
|
2484
|
1
|
|
|
|
|
|
bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error) const { |
|
2485
|
1
|
|
|
|
|
|
return parse(s, options, error, nullptr); |
|
2486
|
|
|
|
|
|
|
} |
|
2487
|
|
|
|
|
|
|
|
|
2488
|
1
|
|
|
|
|
|
bool model_morphodita_parsito::parse(sentence& s, const string& options, string& error, double* cost) const { |
|
2489
|
|
|
|
|
|
|
error.clear(); |
|
2490
|
|
|
|
|
|
|
|
|
2491
|
1
|
50
|
|
|
|
|
if (!parser) return error.assign("No parser defined for the UDPipe model!"), false; |
|
2492
|
1
|
50
|
|
|
|
|
if (s.empty()) return true; |
|
2493
|
|
|
|
|
|
|
|
|
2494
|
1
|
|
|
|
|
|
parser_cache* c = parser_caches.pop(); |
|
2495
|
1
|
50
|
|
|
|
|
if (!c) c = new parser_cache(); |
|
2496
|
|
|
|
|
|
|
|
|
2497
|
1
|
|
|
|
|
|
int beam_search = 5; |
|
2498
|
1
|
50
|
|
|
|
|
if (!named_values::parse(options, c->options, error)) |
|
2499
|
|
|
|
|
|
|
return false; |
|
2500
|
2
|
50
|
|
|
|
|
if (c->options.count("beam_search")) |
|
2501
|
0
|
0
|
|
|
|
|
if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error)) |
|
|
|
0
|
|
|
|
|
|
|
2502
|
|
|
|
|
|
|
return false; |
|
2503
|
|
|
|
|
|
|
|
|
2504
|
1
|
|
|
|
|
|
c->tree.clear(); |
|
2505
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
|
2506
|
7
|
|
|
|
|
|
c->tree.add_node(string()); |
|
2507
|
7
|
|
|
|
|
|
normalize_form(s.words[i].form, c->tree.nodes.back().form); |
|
2508
|
7
|
|
|
|
|
|
normalize_lemma(s.words[i].lemma, c->tree.nodes.back().lemma); |
|
2509
|
14
|
|
|
|
|
|
c->tree.nodes.back().upostag.assign(s.words[i].upostag); |
|
2510
|
14
|
|
|
|
|
|
c->tree.nodes.back().xpostag.assign(s.words[i].xpostag); |
|
2511
|
14
|
|
|
|
|
|
c->tree.nodes.back().feats.assign(s.words[i].feats); |
|
2512
|
14
|
|
|
|
|
|
c->tree.nodes.back().deps.assign(s.words[i].deps); |
|
2513
|
14
|
|
|
|
|
|
c->tree.nodes.back().misc.assign(s.words[i].misc); |
|
2514
|
|
|
|
|
|
|
} |
|
2515
|
|
|
|
|
|
|
|
|
2516
|
1
|
|
|
|
|
|
parser->parse(c->tree, beam_search, cost); |
|
2517
|
8
|
100
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) |
|
2518
|
7
|
|
|
|
|
|
s.set_head(i, c->tree.nodes[i].head, c->tree.nodes[i].deprel); |
|
2519
|
|
|
|
|
|
|
|
|
2520
|
1
|
|
|
|
|
|
parser_caches.push(c); |
|
2521
|
|
|
|
|
|
|
return true; |
|
2522
|
|
|
|
|
|
|
} |
|
2523
|
|
|
|
|
|
|
|
|
2524
|
1
|
|
|
|
|
|
// Load a morphodita_parsito model from the stream.
//
// Layout as read below: version byte; for version >= 2 two 0x7F sentinel
// bytes; tokenizer flag byte followed (when non-zero) by the tokenizer
// factory and the multiword splitter; tagger count byte followed, for each
// tagger, by lemma, xpostag and feats bytes and the tagger data; parser flag
// byte followed (when non-zero) by the parser. Returns nullptr on any read
// or validation failure.
model* model_morphodita_parsito::load(istream& is) {
  char version;
  if (!is.get(version)) return nullptr;
  if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr;

  // Because UDPipe 1.0 does not check the model version,
  // a specific sentinel was added since version 2 so that
  // loading of such model fail on UDPipe 1.0
  if (version >= 2) {
    char sentinel;
    if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
    if (!is.get(sentinel) || sentinel != 0x7F) return nullptr;
  }

  // The element type must be spelled out: it cannot be deduced from a raw pointer.
  unique_ptr<model_morphodita_parsito> m(new model_morphodita_parsito(static_cast<unsigned char>(version)));
  if (!m) return nullptr;

  char tokenizer;
  if (!is.get(tokenizer)) return nullptr;
  m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr);
  if (tokenizer && !m->tokenizer_factory) return nullptr;
  m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr);
  if (tokenizer && !m->splitter) return nullptr;

  m->taggers.clear();
  char taggers; if (!is.get(taggers)) return nullptr;
  for (char i = 0; i < taggers; i++) {
    char lemma; if (!is.get(lemma)) return nullptr;
    char xpostag; if (!is.get(xpostag)) return nullptr;
    char feats; if (!is.get(feats)) return nullptr;

    // Peek (without consuming) the first byte of the tagger data; a tagger
    // is "raw" unless that byte is one of the CONLLU tagger ids.
    int model_type = is.peek();
    bool raw = !(model_type == morphodita::tagger_ids::CONLLU2 ||
                 model_type == morphodita::tagger_ids::CONLLU2_3 ||
                 model_type == morphodita::tagger_ids::CONLLU3);
    morphodita::tagger* tagger = morphodita::tagger::load(is);
    if (!tagger) return nullptr;
    m->taggers.emplace_back(raw, i == 0, int(lemma), bool(xpostag), bool(feats), tagger);
  }

  char parser;
  if (!is.get(parser)) return nullptr;
  m->parser.reset(parser ? parsito::parser::load(is) : nullptr);
  if (parser && !m->parser) return nullptr;

  return m.release();
}
|
2570
|
|
|
|
|
|
|
|
|
2571
|
0
|
|
|
|
|
|
// Construct a model that only records the given format version.
model_morphodita_parsito::model_morphodita_parsito(unsigned version)
    : version(version) {
}
|
2572
|
|
|
|
|
|
|
|
|
2573
|
0
|
|
|
|
|
|
// Read the whole remaining input into block, terminating every line with
// '\n'. Returns true unless the stream is in a failed state afterwards.
bool model_morphodita_parsito::joint_with_parsing_tokenizer::read_block(istream& is, string& block) const {
  block.clear();

  string line;
  while (getline(is, line)) {
    block += line;
    block += '\n';
  }

  // When some data were read before hitting EOF, keep only eofbit set so
  // this call is reported as successful.
  if (!block.empty() && is.eof()) is.clear(istream::eofbit);
  return !is.fail();
}
|
2584
|
|
|
|
|
|
|
|
|
2585
|
0
|
|
|
|
|
|
// Start a new document with the given id: sentence numbering restarts at 1,
// the current text is replaced by an empty one and any buffered sentences
// are dropped.
void model_morphodita_parsito::joint_with_parsing_tokenizer::reset_document(string_piece id) {
  document_id.assign(id.str, id.len);
  new_document = true;
  sentence_id = 1;

  set_text("", false);

  sentences_index = 0;
  sentences.clear();
}
|
2593
|
|
|
|
|
|
|
|
|
2594
|
0
|
|
|
|
|
|
// Set the text to be tokenized. With make_copy the characters are copied
// into text_copy and the stored view points at that copy; otherwise the
// given view is stored as is.
void model_morphodita_parsito::joint_with_parsing_tokenizer::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  text.str = text_copy.c_str();
  this->text = text;
}
|
2601
|
|
|
|
|
|
|
|
|
2602
|
0
|
|
|
|
|
|
bool model_morphodita_parsito::joint_with_parsing_tokenizer::next_sentence(sentence& s, string& error) { |
|
2603
|
|
|
|
|
|
|
error.clear(); |
|
2604
|
|
|
|
|
|
|
|
|
2605
|
0
|
0
|
|
|
|
|
if (text.len) { |
|
2606
|
|
|
|
|
|
|
sentences.clear(); |
|
2607
|
0
|
|
|
|
|
|
sentences_index = 0; |
|
2608
|
|
|
|
|
|
|
|
|
2609
|
0
|
|
|
|
|
|
tokenizer->set_text(text, false); |
|
2610
|
|
|
|
|
|
|
|
|
2611
|
0
|
|
|
|
|
|
sentence input; |
|
2612
|
0
|
|
|
|
|
|
vector paragraph; |
|
2613
|
0
|
0
|
|
|
|
|
while (tokenizer->next_sentence(input, error)) { |
|
|
|
0
|
|
|
|
|
|
|
2614
|
0
|
0
|
|
|
|
|
if (input.get_new_par() && !paragraph.empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2615
|
0
|
0
|
|
|
|
|
if (!parse_paragraph(paragraph, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
2616
|
0
|
0
|
|
|
|
|
for (auto&& sentence : paragraph) |
|
2617
|
0
|
0
|
|
|
|
|
sentences.push_back(sentence); |
|
2618
|
|
|
|
|
|
|
paragraph.clear(); |
|
2619
|
|
|
|
|
|
|
} |
|
2620
|
0
|
0
|
|
|
|
|
paragraph.push_back(input); |
|
2621
|
|
|
|
|
|
|
} |
|
2622
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
|
2623
|
|
|
|
|
|
|
|
|
2624
|
0
|
0
|
|
|
|
|
if (!paragraph.empty()) { |
|
2625
|
0
|
0
|
|
|
|
|
if (!parse_paragraph(paragraph, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
2626
|
0
|
0
|
|
|
|
|
for (auto&& sentence : paragraph) |
|
2627
|
0
|
0
|
|
|
|
|
sentences.push_back(sentence); |
|
2628
|
|
|
|
|
|
|
} |
|
2629
|
|
|
|
|
|
|
|
|
2630
|
0
|
|
|
|
|
|
text.len = 0; |
|
2631
|
|
|
|
|
|
|
} |
|
2632
|
|
|
|
|
|
|
|
|
2633
|
0
|
0
|
|
|
|
|
if (sentences_index < sentences.size()) { |
|
2634
|
0
|
|
|
|
|
|
s = sentences[sentences_index++]; |
|
2635
|
0
|
|
|
|
|
|
return true; |
|
2636
|
|
|
|
|
|
|
} |
|
2637
|
|
|
|
|
|
|
|
|
2638
|
|
|
|
|
|
|
return false; |
|
2639
|
|
|
|
|
|
|
} |
|
2640
|
|
|
|
|
|
|
|
|
2641
|
0
|
|
|
|
|
|
bool model_morphodita_parsito::joint_with_parsing_tokenizer::parse_paragraph(vector& paragraph, string& error) { |
|
2642
|
0
|
|
|
|
|
|
sentence all_words; |
|
2643
|
0
|
0
|
|
|
|
|
vector sentence_boundary(1, true); |
|
2644
|
0
|
0
|
|
|
|
|
vector token_boundary(1, true); |
|
2645
|
|
|
|
|
|
|
|
|
2646
|
0
|
0
|
|
|
|
|
for (auto&& s : paragraph) { |
|
2647
|
0
|
|
|
|
|
|
unsigned offset = all_words.words.size() - 1; |
|
2648
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < s.words.size(); i++) { |
|
2649
|
0
|
0
|
|
|
|
|
all_words.words.push_back(s.words[i]); |
|
2650
|
0
|
|
|
|
|
|
all_words.words.back().id += offset; |
|
2651
|
0
|
0
|
|
|
|
|
sentence_boundary.push_back(i+1 == s.words.size()); |
|
2652
|
0
|
0
|
|
|
|
|
token_boundary.push_back(true); |
|
2653
|
|
|
|
|
|
|
} |
|
2654
|
|
|
|
|
|
|
|
|
2655
|
0
|
0
|
|
|
|
|
for (auto&& mwt : s.multiword_tokens) { |
|
2656
|
0
|
0
|
|
|
|
|
all_words.multiword_tokens.push_back(mwt); |
|
2657
|
0
|
|
|
|
|
|
all_words.multiword_tokens.back().id_first += offset; |
|
2658
|
0
|
|
|
|
|
|
all_words.multiword_tokens.back().id_last += offset; |
|
2659
|
0
|
0
|
|
|
|
|
for (int i = all_words.multiword_tokens.back().id_first; i < all_words.multiword_tokens.back().id_last; i++) |
|
2660
|
0
|
|
|
|
|
|
token_boundary[i] = false; |
|
2661
|
|
|
|
|
|
|
} |
|
2662
|
|
|
|
|
|
|
} |
|
2663
|
|
|
|
|
|
|
|
|
2664
|
0
|
0
|
|
|
|
|
vector best_logprob(all_words.words.size(), -numeric_limits::infinity()); best_logprob[0] = 0.; |
|
2665
|
0
|
0
|
|
|
|
|
vector best_length(all_words.words.size(), 0); |
|
2666
|
0
|
0
|
|
|
|
|
sentence s; |
|
2667
|
|
|
|
|
|
|
|
|
2668
|
0
|
0
|
|
|
|
|
for (unsigned start = 1; start < all_words.words.size(); start++) { |
|
2669
|
0
|
0
|
|
|
|
|
if (!token_boundary[start - 1]) continue; |
|
2670
|
0
|
0
|
|
|
|
|
s.clear(); |
|
2671
|
0
|
0
|
|
|
|
|
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2672
|
0
|
0
|
|
|
|
|
s.words.push_back(all_words.words[end - 1]); |
|
2673
|
0
|
|
|
|
|
|
s.words.back().id -= start - 1; |
|
2674
|
0
|
0
|
|
|
|
|
if (!token_boundary[end - 1]) continue; |
|
2675
|
|
|
|
|
|
|
|
|
2676
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < s.words.size(); i++) { |
|
2677
|
0
|
|
|
|
|
|
s.words[i].head = -1; |
|
2678
|
|
|
|
|
|
|
s.words[i].children.clear(); |
|
2679
|
|
|
|
|
|
|
} |
|
2680
|
|
|
|
|
|
|
|
|
2681
|
|
|
|
|
|
|
double cost; |
|
2682
|
0
|
0
|
|
|
|
|
if (!model.parse(s, DEFAULT, error, &cost)) return false; |
|
|
|
0
|
|
|
|
|
|
|
2683
|
0
|
|
|
|
|
|
cost += sentence_logprob + change_boundary_logprob * (2 - int(sentence_boundary[start - 1]) - int(sentence_boundary[end - 1])); |
|
2684
|
0
|
0
|
|
|
|
|
if (best_logprob[start - 1] + cost > best_logprob[end - 1]) { |
|
2685
|
0
|
|
|
|
|
|
best_logprob[end - 1] = best_logprob[start - 1] + cost; |
|
2686
|
0
|
|
|
|
|
|
best_length[end - 1] = end - start; |
|
2687
|
|
|
|
|
|
|
} |
|
2688
|
|
|
|
|
|
|
} |
|
2689
|
|
|
|
|
|
|
} |
|
2690
|
|
|
|
|
|
|
|
|
2691
|
|
|
|
|
|
|
vector sentence_lengths; |
|
2692
|
0
|
0
|
|
|
|
|
for (unsigned end = all_words.words.size(); end > 1; end -= best_length[end - 1]) |
|
2693
|
0
|
0
|
|
|
|
|
sentence_lengths.push_back(best_length[end - 1]); |
|
2694
|
|
|
|
|
|
|
|
|
2695
|
|
|
|
|
|
|
paragraph.clear(); |
|
2696
|
|
|
|
|
|
|
|
|
2697
|
0
|
|
|
|
|
|
sentence_lengths.push_back(1); |
|
2698
|
|
|
|
|
|
|
reverse(sentence_lengths.begin(), sentence_lengths.end()); |
|
2699
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < sentence_lengths.size(); i++) { |
|
2700
|
0
|
|
|
|
|
|
sentence_lengths[i] += sentence_lengths[i - 1]; |
|
2701
|
|
|
|
|
|
|
|
|
2702
|
0
|
0
|
|
|
|
|
paragraph.emplace_back(); |
|
2703
|
0
|
0
|
|
|
|
|
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2704
|
0
|
0
|
|
|
|
|
paragraph.back().multiword_tokens.push_back(all_words.multiword_tokens.front()); |
|
2705
|
0
|
|
|
|
|
|
paragraph.back().multiword_tokens.back().id_first -= sentence_lengths[i-1] - 1; |
|
2706
|
0
|
|
|
|
|
|
paragraph.back().multiword_tokens.back().id_last -= sentence_lengths[i-1] - 1; |
|
2707
|
|
|
|
|
|
|
all_words.multiword_tokens.erase(all_words.multiword_tokens.begin()); |
|
2708
|
|
|
|
|
|
|
} |
|
2709
|
|
|
|
|
|
|
|
|
2710
|
0
|
0
|
|
|
|
|
for (unsigned word = sentence_lengths[i - 1]; word < sentence_lengths[i]; word++) { |
|
2711
|
0
|
0
|
|
|
|
|
paragraph.back().words.push_back(all_words.words[word]); |
|
2712
|
0
|
|
|
|
|
|
paragraph.back().words.back().id -= sentence_lengths[i-1] - 1; |
|
2713
|
0
|
|
|
|
|
|
paragraph.back().words.back().head = -1; |
|
2714
|
|
|
|
|
|
|
paragraph.back().words.back().children.clear(); |
|
2715
|
|
|
|
|
|
|
} |
|
2716
|
|
|
|
|
|
|
} |
|
2717
|
|
|
|
|
|
|
|
|
2718
|
0
|
0
|
|
|
|
|
if (!paragraph.empty()) { |
|
2719
|
0
|
0
|
|
|
|
|
if (new_document) { |
|
2720
|
0
|
0
|
|
|
|
|
paragraph.front().set_new_doc(true, document_id); |
|
2721
|
0
|
|
|
|
|
|
new_document = false; |
|
2722
|
|
|
|
|
|
|
} |
|
2723
|
|
|
|
|
|
|
|
|
2724
|
0
|
0
|
|
|
|
|
paragraph.front().set_new_par(true); |
|
2725
|
|
|
|
|
|
|
} |
|
2726
|
|
|
|
|
|
|
|
|
2727
|
|
|
|
|
|
|
return true; |
|
2728
|
|
|
|
|
|
|
} |
|
2729
|
|
|
|
|
|
|
|
|
2730
|
7
|
|
|
|
|
|
void model_morphodita_parsito::fill_word_analysis(const morphodita::tagged_lemma& analysis, bool raw, bool upostag, int lemma, bool xpostag, bool feats, word& word) const { |
|
2731
|
|
|
|
|
|
|
// Handle raw MorphoDiTa models. |
|
2732
|
7
|
50
|
|
|
|
|
if (raw) { |
|
2733
|
0
|
0
|
|
|
|
|
if (lemma) word.lemma.assign(analysis.lemma); |
|
2734
|
0
|
0
|
|
|
|
|
if (xpostag) word.xpostag.assign(analysis.tag); |
|
2735
|
|
|
|
|
|
|
return; |
|
2736
|
|
|
|
|
|
|
} |
|
2737
|
|
|
|
|
|
|
|
|
2738
|
|
|
|
|
|
|
// Lemma |
|
2739
|
7
|
50
|
|
|
|
|
if (lemma == 1) { |
|
2740
|
7
|
|
|
|
|
|
word.lemma.assign(analysis.lemma); |
|
2741
|
0
|
0
|
|
|
|
|
} else if (lemma == 2) { |
|
2742
|
0
|
|
|
|
|
|
word.lemma.assign(analysis.lemma); |
|
2743
|
|
|
|
|
|
|
|
|
2744
|
|
|
|
|
|
|
// Lemma matching ~replacement~normalized_form is changed to replacement. |
|
2745
|
0
|
0
|
|
|
|
|
if (analysis.lemma[0] == '~') { |
|
2746
|
0
|
|
|
|
|
|
auto end = analysis.lemma.find('~', 1); |
|
2747
|
0
|
0
|
|
|
|
|
if (end != string::npos) { |
|
2748
|
0
|
|
|
|
|
|
normalize_form(word.form, word.lemma); |
|
2749
|
0
|
0
|
|
|
|
|
if (analysis.lemma.compare(end + 1, string::npos, word.lemma) == 0) |
|
2750
|
0
|
|
|
|
|
|
word.lemma.assign(analysis.lemma, 1, end - 1); |
|
2751
|
|
|
|
|
|
|
else |
|
2752
|
|
|
|
|
|
|
word.lemma.assign(analysis.lemma); |
|
2753
|
|
|
|
|
|
|
} |
|
2754
|
|
|
|
|
|
|
} |
|
2755
|
|
|
|
|
|
|
} |
|
2756
|
7
|
50
|
|
|
|
|
if (version == 2) { |
|
2757
|
|
|
|
|
|
|
// Replace '\001' back to spaces |
|
2758
|
0
|
0
|
|
|
|
|
for (auto && chr : word.lemma) |
|
2759
|
0
|
0
|
|
|
|
|
if (chr == '\001') |
|
2760
|
0
|
|
|
|
|
|
chr = ' '; |
|
2761
|
7
|
50
|
|
|
|
|
} else if (version >= 3) { |
|
2762
|
|
|
|
|
|
|
// Replace '0xC2 0xA0' back to spaces |
|
2763
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i + 1 < word.lemma.size(); i++) |
|
2764
|
0
|
0
|
|
|
|
|
if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2765
|
0
|
|
|
|
|
|
word.lemma.replace(i, 2, 1, ' '); |
|
2766
|
|
|
|
|
|
|
} |
|
2767
|
|
|
|
|
|
|
|
|
2768
|
7
|
50
|
|
|
|
|
if (!upostag && !xpostag && !feats) return; |
|
|
|
0
|
|
|
|
|
|
|
2769
|
|
|
|
|
|
|
|
|
2770
|
|
|
|
|
|
|
// UPOSTag |
|
2771
|
7
|
|
|
|
|
|
char separator = analysis.tag[0]; |
|
2772
|
7
|
|
|
|
|
|
size_t start = min(size_t(1), analysis.tag.size()), end = min(analysis.tag.find(separator, 1), analysis.tag.size()); |
|
2773
|
7
|
50
|
|
|
|
|
if (upostag) word.upostag.assign(analysis.tag, start, end - start); |
|
2774
|
|
|
|
|
|
|
|
|
2775
|
7
|
50
|
|
|
|
|
if (!xpostag && !feats) return; |
|
2776
|
|
|
|
|
|
|
|
|
2777
|
|
|
|
|
|
|
// XPOSTag |
|
2778
|
14
|
|
|
|
|
|
start = min(end + 1, analysis.tag.size()); |
|
2779
|
7
|
|
|
|
|
|
end = min(analysis.tag.find(separator, start), analysis.tag.size()); |
|
2780
|
7
|
50
|
|
|
|
|
if (xpostag) word.xpostag.assign(analysis.tag, start, end - start); |
|
2781
|
|
|
|
|
|
|
|
|
2782
|
7
|
50
|
|
|
|
|
if (!feats) return; |
|
2783
|
|
|
|
|
|
|
|
|
2784
|
|
|
|
|
|
|
// Features |
|
2785
|
14
|
|
|
|
|
|
start = min(end + 1, analysis.tag.size()); |
|
2786
|
7
|
|
|
|
|
|
word.feats.assign(analysis.tag, start, analysis.tag.size() - start); |
|
2787
|
|
|
|
|
|
|
} |
|
2788
|
|
|
|
|
|
|
|
|
2789
|
14
|
|
|
|
|
|
const string& model_morphodita_parsito::normalize_form(string_piece form, string& output) const { |
|
2790
|
|
|
|
|
|
|
using unilib::utf8; |
|
2791
|
|
|
|
|
|
|
|
|
2792
|
|
|
|
|
|
|
// No normalization on version 1 |
|
2793
|
28
|
50
|
|
|
|
|
if (version <= 1) return output.assign(form.str, form.len); |
|
2794
|
|
|
|
|
|
|
|
|
2795
|
|
|
|
|
|
|
// If requested, replace space by \001 in version 2 and by (\u00a0) since version 3 |
|
2796
|
|
|
|
|
|
|
|
|
2797
|
|
|
|
|
|
|
// Arabic normalization since version 2, implementation resulted from |
|
2798
|
|
|
|
|
|
|
// discussion with Otakar Smrz and Nasrin Taghizadeh. |
|
2799
|
|
|
|
|
|
|
// 1. Remove https://codepoints.net/U+0640 without any reasonable doubt :) |
|
2800
|
|
|
|
|
|
|
// 2. Remove https://codepoints.net/U+0652 |
|
2801
|
|
|
|
|
|
|
// 3. Remove https://codepoints.net/U+0670 |
|
2802
|
|
|
|
|
|
|
// 4. Remove everything from https://codepoints.net/U+0653 to |
|
2803
|
|
|
|
|
|
|
// https://codepoints.net/U+0657 though they are probably very rare in date |
|
2804
|
|
|
|
|
|
|
// 5. Remove everything from https://codepoints.net/U+064B to |
|
2805
|
|
|
|
|
|
|
// https://codepoints.net/U+0650 |
|
2806
|
|
|
|
|
|
|
// 6. Remove https://codepoints.net/U+0651 |
|
2807
|
|
|
|
|
|
|
// 7. Replace https://codepoints.net/U+0671 with https://codepoints.net/U+0627 |
|
2808
|
|
|
|
|
|
|
// 8. Replace https://codepoints.net/U+0622 with https://codepoints.net/U+0627 |
|
2809
|
|
|
|
|
|
|
// 9. Replace https://codepoints.net/U+0623 with https://codepoints.net/U+0627 |
|
2810
|
|
|
|
|
|
|
// 10. Replace https://codepoints.net/U+0625 with https://codepoints.net/U+0627 |
|
2811
|
|
|
|
|
|
|
// 11. Replace https://codepoints.net/U+0624 with https://codepoints.net/U+0648 |
|
2812
|
|
|
|
|
|
|
// 12. Replace https://codepoints.net/U+0626 with https://codepoints.net/U+064A |
|
2813
|
|
|
|
|
|
|
// One might also consider replacing some Farsi characters that might be typed |
|
2814
|
|
|
|
|
|
|
// unintentionally (by Iranians writing Arabic language texts): |
|
2815
|
|
|
|
|
|
|
// 13. Replace https://codepoints.net/U+06CC with https://codepoints.net/U+064A |
|
2816
|
|
|
|
|
|
|
// 14. Replace https://codepoints.net/U+06A9 with https://codepoints.net/U+0643 |
|
2817
|
|
|
|
|
|
|
// 15. Replace https://codepoints.net/U+06AA with https://codepoints.net/U+0643 |
|
2818
|
|
|
|
|
|
|
// |
|
2819
|
|
|
|
|
|
|
// Not implemented: |
|
2820
|
|
|
|
|
|
|
// There is additional challenge with data coming from Egypt (such as printed |
|
2821
|
|
|
|
|
|
|
// or online newspapers), where the word-final https://codepoints.net/U+064A |
|
2822
|
|
|
|
|
|
|
// may be switched for https://codepoints.net/U+0649 and visa versa. Also, the |
|
2823
|
|
|
|
|
|
|
// word-final https://codepoints.net/U+0647 could actually represent https:// |
|
2824
|
|
|
|
|
|
|
// codepoints.net/U+0629. You can experiment with the following replacements, |
|
2825
|
|
|
|
|
|
|
// but I would rather apply them only after classifying the whole document as |
|
2826
|
|
|
|
|
|
|
// following such convention: |
|
2827
|
|
|
|
|
|
|
// 1. Replace https://codepoints.net/U+0629 with https://codepoints.net/U+0647 |
|
2828
|
|
|
|
|
|
|
// (frequent femine ending markers would appear like a third-person |
|
2829
|
|
|
|
|
|
|
// masculine pronoun clitic instead) |
|
2830
|
|
|
|
|
|
|
// 2. Replace https://codepoints.net/U+0649 with https://codepoints.net/U+064A |
|
2831
|
|
|
|
|
|
|
// (some "weak" words would become even more ambiguous or appear as if |
|
2832
|
|
|
|
|
|
|
// with a first-person pronoun clitic) |
|
2833
|
|
|
|
|
|
|
|
|
2834
|
|
|
|
|
|
|
output.clear(); |
|
2835
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(form.str, form.len)) { |
|
2836
|
|
|
|
|
|
|
// Arabic normalization |
|
2837
|
0
|
0
|
|
|
|
|
if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {} |
|
|
|
0
|
|
|
|
|
|
|
2838
|
0
|
0
|
|
|
|
|
else if (chr == 0x622) utf8::append(output, 0x627); |
|
2839
|
0
|
0
|
|
|
|
|
else if (chr == 0x623) utf8::append(output, 0x627); |
|
2840
|
0
|
0
|
|
|
|
|
else if (chr == 0x624) utf8::append(output, 0x648); |
|
2841
|
0
|
0
|
|
|
|
|
else if (chr == 0x625) utf8::append(output, 0x627); |
|
2842
|
0
|
0
|
|
|
|
|
else if (chr == 0x626) utf8::append(output, 0x64A); |
|
2843
|
0
|
0
|
|
|
|
|
else if (chr == 0x671) utf8::append(output, 0x627); |
|
2844
|
0
|
0
|
|
|
|
|
else if (chr == 0x6A9) utf8::append(output, 0x643); |
|
2845
|
0
|
0
|
|
|
|
|
else if (chr == 0x6AA) utf8::append(output, 0x643); |
|
2846
|
0
|
0
|
|
|
|
|
else if (chr == 0x6CC) utf8::append(output, 0x64A); |
|
2847
|
|
|
|
|
|
|
// Space normalization |
|
2848
|
0
|
0
|
|
|
|
|
else if (chr == ' ' && version == 2) utf8::append(output, 0x01); |
|
|
|
0
|
|
|
|
|
|
|
2849
|
0
|
0
|
|
|
|
|
else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0); |
|
|
|
0
|
|
|
|
|
|
|
2850
|
|
|
|
|
|
|
// Default |
|
2851
|
0
|
|
|
|
|
|
else utf8::append(output, chr); |
|
2852
|
|
|
|
|
|
|
} |
|
2853
|
|
|
|
|
|
|
|
|
2854
|
|
|
|
|
|
|
// Make sure we do not remove everything |
|
2855
|
0
|
0
|
|
|
|
|
if (output.empty() && form.len) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2856
|
0
|
|
|
|
|
|
utf8::append(output, utf8::first(form.str, form.len)); |
|
2857
|
|
|
|
|
|
|
|
|
2858
|
|
|
|
|
|
|
return output; |
|
2859
|
|
|
|
|
|
|
} |
|
2860
|
|
|
|
|
|
|
|
|
2861
|
7
|
|
|
|
|
|
const string& model_morphodita_parsito::normalize_lemma(string_piece lemma, string& output) const { |
|
2862
|
|
|
|
|
|
|
using unilib::utf8; |
|
2863
|
|
|
|
|
|
|
|
|
2864
|
|
|
|
|
|
|
// No normalization on version 1 and 2 |
|
2865
|
14
|
50
|
|
|
|
|
if (version <= 2) return output.assign(lemma.str, lemma.len); |
|
2866
|
|
|
|
|
|
|
|
|
2867
|
|
|
|
|
|
|
// Normalize spaces by since version 3 |
|
2868
|
|
|
|
|
|
|
output.clear(); |
|
2869
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < lemma.len; i++) { |
|
2870
|
|
|
|
|
|
|
// Space normalization |
|
2871
|
0
|
0
|
|
|
|
|
if (lemma.str[i] == ' ') utf8::append(output, 0xA0); |
|
2872
|
|
|
|
|
|
|
// Default |
|
2873
|
0
|
|
|
|
|
|
else output.push_back(lemma.str[i]); |
|
2874
|
|
|
|
|
|
|
} |
|
2875
|
|
|
|
|
|
|
|
|
2876
|
|
|
|
|
|
|
return output; |
|
2877
|
|
|
|
|
|
|
} |
|
2878
|
|
|
|
|
|
|
|
|
2879
|
|
|
|
|
|
|
///////// |
|
2880
|
|
|
|
|
|
|
// File: model/pipeline.h |
|
2881
|
|
|
|
|
|
|
///////// |
|
2882
|
|
|
|
|
|
|
|
|
2883
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
2884
|
|
|
|
|
|
|
// |
|
2885
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2886
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2887
|
|
|
|
|
|
|
// |
|
2888
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2889
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2890
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2891
|
|
|
|
|
|
|
|
|
2892
|
|
|
|
|
|
|
class pipeline { |
|
2893
|
|
|
|
|
|
|
public: |
|
2894
|
|
|
|
|
|
|
pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output); |
|
2895
|
|
|
|
|
|
|
|
|
2896
|
|
|
|
|
|
|
void set_model(const model* m); |
|
2897
|
|
|
|
|
|
|
void set_input(const string& input); |
|
2898
|
|
|
|
|
|
|
void set_tagger(const string& tagger); |
|
2899
|
|
|
|
|
|
|
void set_parser(const string& parser); |
|
2900
|
|
|
|
|
|
|
void set_output(const string& output); |
|
2901
|
|
|
|
|
|
|
|
|
2902
|
|
|
|
|
|
|
void set_immediate(bool immediate); |
|
2903
|
|
|
|
|
|
|
void set_document_id(const string& document_id); |
|
2904
|
|
|
|
|
|
|
|
|
2905
|
|
|
|
|
|
|
bool process(istream& is, ostream& os, string& error) const; |
|
2906
|
|
|
|
|
|
|
|
|
2907
|
|
|
|
|
|
|
static const string DEFAULT; |
|
2908
|
|
|
|
|
|
|
static const string NONE; |
|
2909
|
|
|
|
|
|
|
|
|
2910
|
|
|
|
|
|
|
private: |
|
2911
|
|
|
|
|
|
|
const model* m; |
|
2912
|
|
|
|
|
|
|
string input, tokenizer, tagger, parser, output; |
|
2913
|
|
|
|
|
|
|
string document_id; |
|
2914
|
|
|
|
|
|
|
bool immediate; |
|
2915
|
|
|
|
|
|
|
}; |
|
2916
|
|
|
|
|
|
|
|
|
2917
|
|
|
|
|
|
|
///////// |
|
2918
|
|
|
|
|
|
|
// File: sentence/output_format.h |
|
2919
|
|
|
|
|
|
|
///////// |
|
2920
|
|
|
|
|
|
|
|
|
2921
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
2922
|
|
|
|
|
|
|
// |
|
2923
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2924
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2925
|
|
|
|
|
|
|
// |
|
2926
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2927
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2928
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2929
|
|
|
|
|
|
|
|
|
2930
|
1
|
|
|
|
|
|
class output_format { |
|
2931
|
|
|
|
|
|
|
public: |
|
2932
|
1
|
|
|
|
|
|
virtual ~output_format() {} |
|
2933
|
|
|
|
|
|
|
|
|
2934
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) = 0; |
|
2935
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) {} |
|
2936
|
|
|
|
|
|
|
|
|
2937
|
|
|
|
|
|
|
// Static factory methods |
|
2938
|
|
|
|
|
|
|
static output_format* new_output_format(const string& name); |
|
2939
|
|
|
|
|
|
|
static output_format* new_conllu_output_format(const string& options = string()); |
|
2940
|
|
|
|
|
|
|
static output_format* new_epe_output_format(const string& options = string()); |
|
2941
|
|
|
|
|
|
|
static output_format* new_matxin_output_format(const string& options = string()); |
|
2942
|
|
|
|
|
|
|
static output_format* new_horizontal_output_format(const string& options = string()); |
|
2943
|
|
|
|
|
|
|
static output_format* new_plaintext_output_format(const string& options = string()); |
|
2944
|
|
|
|
|
|
|
static output_format* new_vertical_output_format(const string& options = string()); |
|
2945
|
|
|
|
|
|
|
|
|
2946
|
|
|
|
|
|
|
static const string CONLLU_V1; |
|
2947
|
|
|
|
|
|
|
static const string CONLLU_V2; |
|
2948
|
|
|
|
|
|
|
static const string HORIZONTAL_PARAGRAPHS; |
|
2949
|
|
|
|
|
|
|
static const string PLAINTEXT_NORMALIZED_SPACES; |
|
2950
|
|
|
|
|
|
|
static const string VERTICAL_PARAGRAPHS; |
|
2951
|
|
|
|
|
|
|
}; |
|
2952
|
|
|
|
|
|
|
|
|
2953
|
|
|
|
|
|
|
///////// |
|
2954
|
|
|
|
|
|
|
// File: utils/getwhole.h |
|
2955
|
|
|
|
|
|
|
///////// |
|
2956
|
|
|
|
|
|
|
|
|
2957
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
2958
|
|
|
|
|
|
|
// |
|
2959
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2960
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2961
|
|
|
|
|
|
|
// |
|
2962
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
2963
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
2964
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
2965
|
|
|
|
|
|
|
|
|
2966
|
|
|
|
|
|
|
namespace utils { |
|
2967
|
|
|
|
|
|
|
|
|
2968
|
|
|
|
|
|
|
// |
|
2969
|
|
|
|
|
|
|
// Declarations |
|
2970
|
|
|
|
|
|
|
// |
|
2971
|
|
|
|
|
|
|
|
|
2972
|
|
|
|
|
|
|
// Read whole content until EOF. All encountered \n are stored. |
|
2973
|
|
|
|
|
|
|
inline istream& getwhole(istream& is, string& whole); |
|
2974
|
|
|
|
|
|
|
|
|
2975
|
|
|
|
|
|
|
// |
|
2976
|
|
|
|
|
|
|
// Definitions |
|
2977
|
|
|
|
|
|
|
// |
|
2978
|
|
|
|
|
|
|
|
|
2979
|
0
|
|
|
|
|
|
istream& getwhole(istream& is, string& whole) { |
|
2980
|
|
|
|
|
|
|
whole.clear(); |
|
2981
|
|
|
|
|
|
|
|
|
2982
|
0
|
0
|
|
|
|
|
for (string line; getline(is, line); ) |
|
|
|
0
|
|
|
|
|
|
|
2983
|
0
|
0
|
|
|
|
|
whole.append(line).push_back('\n'); |
|
2984
|
|
|
|
|
|
|
|
|
2985
|
0
|
0
|
|
|
|
|
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
2986
|
0
|
|
|
|
|
|
return is; |
|
2987
|
|
|
|
|
|
|
} |
|
2988
|
|
|
|
|
|
|
|
|
2989
|
|
|
|
|
|
|
} // namespace utils |
|
2990
|
|
|
|
|
|
|
|
|
2991
|
|
|
|
|
|
|
///////// |
|
2992
|
|
|
|
|
|
|
// File: model/pipeline.cpp |
|
2993
|
|
|
|
|
|
|
///////// |
|
2994
|
|
|
|
|
|
|
|
|
2995
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
2996
|
|
|
|
|
|
|
// |
|
2997
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
2998
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
2999
|
|
|
|
|
|
|
// |
|
3000
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3001
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3002
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3003
|
|
|
|
|
|
|
|
|
3004
|
2
|
|
|
|
|
|
const string pipeline::DEFAULT; |
|
3005
|
2
|
|
|
|
|
|
const string pipeline::NONE = "none"; |
|
3006
|
|
|
|
|
|
|
|
|
3007
|
0
|
|
|
|
|
|
pipeline::pipeline(const model* m, const string& input, const string& tagger, const string& parser, const string& output) : immediate(false) { |
|
3008
|
|
|
|
|
|
|
set_model(m); |
|
3009
|
0
|
0
|
|
|
|
|
set_input(input); |
|
3010
|
|
|
|
|
|
|
set_tagger(tagger); |
|
3011
|
|
|
|
|
|
|
set_parser(parser); |
|
3012
|
0
|
0
|
|
|
|
|
set_output(output); |
|
3013
|
0
|
|
|
|
|
|
} |
|
3014
|
|
|
|
|
|
|
|
|
3015
|
0
|
|
|
|
|
|
void pipeline::set_model(const model* m) { |
|
3016
|
0
|
|
|
|
|
|
this->m = m; |
|
3017
|
0
|
|
|
|
|
|
} |
|
3018
|
|
|
|
|
|
|
|
|
3019
|
0
|
|
|
|
|
|
void pipeline::set_input(const string& input) { |
|
3020
|
|
|
|
|
|
|
tokenizer.clear(); |
|
3021
|
|
|
|
|
|
|
|
|
3022
|
0
|
0
|
|
|
|
|
if (input.empty()) { |
|
3023
|
0
|
|
|
|
|
|
this->input = "conllu"; |
|
3024
|
0
|
0
|
|
|
|
|
} else if (input == "tokenize" || input == "tokenizer") { |
|
3025
|
0
|
|
|
|
|
|
this->input = "tokenizer"; |
|
3026
|
0
|
0
|
|
|
|
|
} else if (input.compare(0, 10, "tokenizer=") == 0) { |
|
3027
|
0
|
|
|
|
|
|
this->input = "tokenizer"; |
|
3028
|
0
|
|
|
|
|
|
tokenizer.assign(input, 10, string::npos); |
|
3029
|
|
|
|
|
|
|
} else { |
|
3030
|
0
|
|
|
|
|
|
this->input = input; |
|
3031
|
|
|
|
|
|
|
} |
|
3032
|
0
|
|
|
|
|
|
} |
|
3033
|
|
|
|
|
|
|
|
|
3034
|
0
|
|
|
|
|
|
void pipeline::set_tagger(const string& tagger) { |
|
3035
|
0
|
|
|
|
|
|
this->tagger = tagger; |
|
3036
|
0
|
|
|
|
|
|
} |
|
3037
|
|
|
|
|
|
|
|
|
3038
|
0
|
|
|
|
|
|
void pipeline::set_parser(const string& parser) { |
|
3039
|
0
|
|
|
|
|
|
this->parser = parser; |
|
3040
|
0
|
|
|
|
|
|
} |
|
3041
|
|
|
|
|
|
|
|
|
3042
|
0
|
|
|
|
|
|
void pipeline::set_output(const string& output) { |
|
3043
|
0
|
0
|
|
|
|
|
this->output = output.empty() ? "conllu" : output; |
|
3044
|
0
|
|
|
|
|
|
} |
|
3045
|
|
|
|
|
|
|
|
|
3046
|
0
|
|
|
|
|
|
void pipeline::set_immediate(bool immediate) { |
|
3047
|
0
|
|
|
|
|
|
this->immediate = immediate; |
|
3048
|
0
|
|
|
|
|
|
} |
|
3049
|
|
|
|
|
|
|
|
|
3050
|
0
|
|
|
|
|
|
void pipeline::set_document_id(const string& document_id) { |
|
3051
|
0
|
|
|
|
|
|
this->document_id = document_id; |
|
3052
|
0
|
|
|
|
|
|
} |
|
3053
|
|
|
|
|
|
|
|
|
3054
|
0
|
|
|
|
|
|
bool pipeline::process(istream& is, ostream& os, string& error) const { |
|
3055
|
|
|
|
|
|
|
error.clear(); |
|
3056
|
|
|
|
|
|
|
|
|
3057
|
0
|
|
|
|
|
|
sentence s; |
|
3058
|
|
|
|
|
|
|
|
|
3059
|
|
|
|
|
|
|
unique_ptr reader; |
|
3060
|
0
|
0
|
|
|
|
|
if (input == "tokenizer") { |
|
3061
|
0
|
0
|
|
|
|
|
reader.reset(m->new_tokenizer(tokenizer)); |
|
3062
|
0
|
0
|
|
|
|
|
if (!reader) return error.assign("The model does not have a tokenizer!"), false; |
|
|
|
0
|
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
} else { |
|
3064
|
0
|
0
|
|
|
|
|
reader.reset(input_format::new_input_format(input)); |
|
3065
|
0
|
0
|
|
|
|
|
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3066
|
|
|
|
|
|
|
} |
|
3067
|
0
|
0
|
|
|
|
|
reader->reset_document(document_id); |
|
3068
|
|
|
|
|
|
|
|
|
3069
|
0
|
0
|
|
|
|
|
unique_ptr writer(output_format::new_output_format(output)); |
|
3070
|
0
|
0
|
|
|
|
|
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3071
|
|
|
|
|
|
|
|
|
3072
|
|
|
|
|
|
|
string block; |
|
3073
|
0
|
0
|
|
|
|
|
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3074
|
0
|
0
|
|
|
|
|
reader->set_text(block); |
|
3075
|
0
|
0
|
|
|
|
|
while (reader->next_sentence(s, error)) { |
|
|
|
0
|
|
|
|
|
|
|
3076
|
0
|
0
|
|
|
|
|
if (tagger != NONE) |
|
3077
|
0
|
0
|
|
|
|
|
if (!m->tag(s, tagger, error)) |
|
|
|
0
|
|
|
|
|
|
|
3078
|
|
|
|
|
|
|
return false; |
|
3079
|
|
|
|
|
|
|
|
|
3080
|
0
|
0
|
|
|
|
|
if (parser != NONE) |
|
3081
|
0
|
0
|
|
|
|
|
if (!m->parse(s, parser, error)) |
|
|
|
0
|
|
|
|
|
|
|
3082
|
|
|
|
|
|
|
return false; |
|
3083
|
|
|
|
|
|
|
|
|
3084
|
0
|
0
|
|
|
|
|
writer->write_sentence(s, os); |
|
3085
|
|
|
|
|
|
|
} |
|
3086
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
|
3087
|
|
|
|
|
|
|
} |
|
3088
|
0
|
0
|
|
|
|
|
writer->finish_document(os); |
|
3089
|
|
|
|
|
|
|
|
|
3090
|
|
|
|
|
|
|
return true; |
|
3091
|
|
|
|
|
|
|
} |
|
3092
|
|
|
|
|
|
|
|
|
3093
|
|
|
|
|
|
|
///////// |
|
3094
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/tagset_converter.h |
|
3095
|
|
|
|
|
|
|
///////// |
|
3096
|
|
|
|
|
|
|
|
|
3097
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
3098
|
|
|
|
|
|
|
// |
|
3099
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
3100
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3101
|
|
|
|
|
|
|
// |
|
3102
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3103
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3104
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3105
|
|
|
|
|
|
|
|
|
3106
|
|
|
|
|
|
|
namespace morphodita { |
|
3107
|
|
|
|
|
|
|
|
|
3108
|
0
|
|
|
|
|
|
class tagset_converter { |
|
3109
|
|
|
|
|
|
|
public: |
|
3110
|
0
|
|
|
|
|
|
virtual ~tagset_converter() {} |
|
3111
|
|
|
|
|
|
|
|
|
3112
|
|
|
|
|
|
|
// Convert a tag-lemma pair to a different tag set. |
|
3113
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const = 0; |
|
3114
|
|
|
|
|
|
|
// Convert a result of analysis to a different tag set. Apart from calling |
|
3115
|
|
|
|
|
|
|
// convert, any repeated entry is removed. |
|
3116
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const = 0; |
|
3117
|
|
|
|
|
|
|
// Convert a result of generation to a different tag set. Apart from calling |
|
3118
|
|
|
|
|
|
|
// convert, any repeated entry is removed. |
|
3119
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const = 0; |
|
3120
|
|
|
|
|
|
|
|
|
3121
|
|
|
|
|
|
|
// Static factory methods |
|
3122
|
|
|
|
|
|
|
static tagset_converter* new_identity_converter(); |
|
3123
|
|
|
|
|
|
|
|
|
3124
|
|
|
|
|
|
|
static tagset_converter* new_pdt_to_conll2009_converter(); |
|
3125
|
|
|
|
|
|
|
static tagset_converter* new_strip_lemma_comment_converter(const morpho& dictionary); |
|
3126
|
|
|
|
|
|
|
static tagset_converter* new_strip_lemma_id_converter(const morpho& dictionary); |
|
3127
|
|
|
|
|
|
|
}; |
|
3128
|
|
|
|
|
|
|
|
|
3129
|
|
|
|
|
|
|
// Helper method for creating tagset_converter from instance name. |
|
3130
|
|
|
|
|
|
|
tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary); |
|
3131
|
|
|
|
|
|
|
|
|
3132
|
|
|
|
|
|
|
// Helper methods making sure remapped results are unique. |
|
3133
|
|
|
|
|
|
|
void tagset_converter_unique_analyzed(vector& tagged_lemmas); |
|
3134
|
|
|
|
|
|
|
void tagset_converter_unique_generated(vector& forms); |
|
3135
|
|
|
|
|
|
|
|
|
3136
|
|
|
|
|
|
|
} // namespace morphodita |
|
3137
|
|
|
|
|
|
|
|
|
3138
|
|
|
|
|
|
|
///////// |
|
3139
|
|
|
|
|
|
|
// File: morphodita/derivator/derivation_formatter.h |
|
3140
|
|
|
|
|
|
|
///////// |
|
3141
|
|
|
|
|
|
|
|
|
3142
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
3143
|
|
|
|
|
|
|
// |
|
3144
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
3145
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3146
|
|
|
|
|
|
|
// |
|
3147
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3148
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3149
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3150
|
|
|
|
|
|
|
|
|
3151
|
|
|
|
|
|
|
namespace morphodita { |
|
3152
|
|
|
|
|
|
|
|
|
3153
|
0
|
|
|
|
|
|
class derivation_formatter { |
|
3154
|
|
|
|
|
|
|
public: |
|
3155
|
0
|
|
|
|
|
|
virtual ~derivation_formatter() {} |
|
3156
|
|
|
|
|
|
|
|
|
3157
|
|
|
|
|
|
|
// Perform the required derivation and store it directly in the lemma. |
|
3158
|
|
|
|
|
|
|
virtual void format_derivation(string& lemma) const; |
|
3159
|
|
|
|
|
|
|
|
|
3160
|
|
|
|
|
|
|
// Perform the required derivation and store it directly in the tagged_lemma. |
|
3161
|
|
|
|
|
|
|
// If a tagset_converter is given, it is also applied. |
|
3162
|
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter = nullptr) const = 0; |
|
3163
|
|
|
|
|
|
|
|
|
3164
|
|
|
|
|
|
|
// Perform the required derivation on a list of tagged_lemmas. |
|
3165
|
|
|
|
|
|
|
// If a tagset_converter is given, it is also applied. |
|
3166
|
|
|
|
|
|
|
// Either way, only unique entries are returned. |
|
3167
|
|
|
|
|
|
|
virtual void format_tagged_lemmas(vector& lemmas, const tagset_converter* converter = nullptr) const; |
|
3168
|
|
|
|
|
|
|
|
|
3169
|
|
|
|
|
|
|
// Static factory methods. |
|
3170
|
|
|
|
|
|
|
static derivation_formatter* new_none_derivation_formatter(); |
|
3171
|
|
|
|
|
|
|
static derivation_formatter* new_root_derivation_formatter(const derivator* derinet); |
|
3172
|
|
|
|
|
|
|
static derivation_formatter* new_path_derivation_formatter(const derivator* derinet); |
|
3173
|
|
|
|
|
|
|
static derivation_formatter* new_tree_derivation_formatter(const derivator* derinet); |
|
3174
|
|
|
|
|
|
|
// String version of static factory method. |
|
3175
|
|
|
|
|
|
|
static derivation_formatter* new_derivation_formatter(string_piece name, const derivator* derinet); |
|
3176
|
|
|
|
|
|
|
}; |
|
3177
|
|
|
|
|
|
|
|
|
3178
|
|
|
|
|
|
|
} // namespace morphodita |
|
3179
|
|
|
|
|
|
|
|
|
3180
|
|
|
|
|
|
|
///////// |
|
3181
|
|
|
|
|
|
|
// File: morphodita/derivator/derivation_formatter.cpp |
|
3182
|
|
|
|
|
|
|
///////// |
|
3183
|
|
|
|
|
|
|
|
|
3184
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
3185
|
|
|
|
|
|
|
// |
|
3186
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
3187
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3188
|
|
|
|
|
|
|
// |
|
3189
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3190
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3191
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3192
|
|
|
|
|
|
|
|
|
3193
|
|
|
|
|
|
|
namespace morphodita { |
|
3194
|
|
|
|
|
|
|
|
|
3195
|
0
|
|
|
|
|
|
void derivation_formatter::format_derivation(string& lemma) const { |
|
3196
|
0
|
|
|
|
|
|
tagged_lemma result; |
|
3197
|
0
|
|
|
|
|
|
result.lemma.swap(lemma); |
|
3198
|
0
|
0
|
|
|
|
|
format_tagged_lemma(result); |
|
3199
|
0
|
|
|
|
|
|
lemma.swap(result.lemma); |
|
3200
|
0
|
|
|
|
|
|
} |
|
3201
|
|
|
|
|
|
|
|
|
3202
|
0
|
|
|
|
|
|
void derivation_formatter::format_tagged_lemmas(vector& lemmas, const tagset_converter* converter) const { |
|
3203
|
0
|
0
|
|
|
|
|
for (auto&& lemma : lemmas) |
|
3204
|
0
|
|
|
|
|
|
format_tagged_lemma(lemma, converter); |
|
3205
|
|
|
|
|
|
|
|
|
3206
|
0
|
0
|
|
|
|
|
if (lemmas.size() > 1) |
|
3207
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(lemmas); |
|
3208
|
0
|
|
|
|
|
|
} |
|
3209
|
|
|
|
|
|
|
|
|
3210
|
0
|
|
|
|
|
|
class none_derivation_formatter : public derivation_formatter { |
|
3211
|
0
|
|
|
|
|
|
virtual void format_derivation(string& /*lemma*/) const override {} |
|
3212
|
|
|
|
|
|
|
|
|
3213
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
|
3214
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
|
3215
|
0
|
|
|
|
|
|
} |
|
3216
|
|
|
|
|
|
|
|
|
3217
|
0
|
|
|
|
|
|
virtual void format_tagged_lemmas(vector& lemmas, const tagset_converter* converter) const override { |
|
3218
|
0
|
0
|
|
|
|
|
if (converter) converter->convert_analyzed(lemmas); |
|
3219
|
0
|
|
|
|
|
|
} |
|
3220
|
|
|
|
|
|
|
}; |
|
3221
|
|
|
|
|
|
|
|
|
3222
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_none_derivation_formatter() { |
|
3223
|
0
|
|
|
|
|
|
return new none_derivation_formatter(); |
|
3224
|
|
|
|
|
|
|
} |
|
3225
|
|
|
|
|
|
|
|
|
3226
|
0
|
|
|
|
|
|
class root_derivation_formatter : public derivation_formatter { |
|
3227
|
|
|
|
|
|
|
public: |
|
3228
|
0
|
|
|
|
|
|
root_derivation_formatter(const derivator* derinet) : derinet(derinet) {} |
|
3229
|
|
|
|
|
|
|
|
|
3230
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
|
3231
|
0
|
0
|
|
|
|
|
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
|
|
|
0
|
|
|
|
|
|
|
3232
|
0
|
|
|
|
|
|
lemma.lemma.assign(parent.lemma); |
|
3233
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
|
3234
|
0
|
|
|
|
|
|
} |
|
3235
|
|
|
|
|
|
|
|
|
3236
|
|
|
|
|
|
|
private: |
|
3237
|
|
|
|
|
|
|
const derivator* derinet; |
|
3238
|
|
|
|
|
|
|
}; |
|
3239
|
|
|
|
|
|
|
|
|
3240
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_root_derivation_formatter(const derivator* derinet) { |
|
3241
|
0
|
0
|
|
|
|
|
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
|
|
|
0
|
|
|
|
|
|
|
3242
|
|
|
|
|
|
|
} |
|
3243
|
|
|
|
|
|
|
|
|
3244
|
0
|
|
|
|
|
|
class path_derivation_formatter : public derivation_formatter { |
|
3245
|
|
|
|
|
|
|
public: |
|
3246
|
0
|
|
|
|
|
|
path_derivation_formatter(const derivator* derinet) : derinet(derinet) {} |
|
3247
|
|
|
|
|
|
|
|
|
3248
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
|
3249
|
0
|
|
|
|
|
|
tagged_lemma current(lemma); |
|
3250
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
|
|
|
0
|
|
|
|
|
|
|
3251
|
0
|
0
|
|
|
|
|
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
|
|
|
0
|
|
|
|
|
|
|
3252
|
0
|
0
|
|
|
|
|
tagged_lemma parrent_lemma(parent.lemma, current.tag); |
|
3253
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(parrent_lemma); |
|
|
|
0
|
|
|
|
|
|
|
3254
|
0
|
0
|
|
|
|
|
lemma.lemma.append(" ").append(parrent_lemma.lemma); |
|
3255
|
|
|
|
|
|
|
} |
|
3256
|
0
|
|
|
|
|
|
} |
|
3257
|
|
|
|
|
|
|
|
|
3258
|
|
|
|
|
|
|
private: |
|
3259
|
|
|
|
|
|
|
const derivator* derinet; |
|
3260
|
|
|
|
|
|
|
}; |
|
3261
|
|
|
|
|
|
|
|
|
3262
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_path_derivation_formatter(const derivator* derinet) { |
|
3263
|
0
|
0
|
|
|
|
|
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
|
|
|
0
|
|
|
|
|
|
|
3264
|
|
|
|
|
|
|
} |
|
3265
|
|
|
|
|
|
|
|
|
3266
|
0
|
|
|
|
|
|
class tree_derivation_formatter : public derivation_formatter { |
|
3267
|
|
|
|
|
|
|
public: |
|
3268
|
0
|
|
|
|
|
|
tree_derivation_formatter(const derivator* derinet) : derinet(derinet) {} |
|
3269
|
|
|
|
|
|
|
|
|
3270
|
0
|
|
|
|
|
|
virtual void format_tagged_lemma(tagged_lemma& lemma, const tagset_converter* converter) const override { |
|
3271
|
|
|
|
|
|
|
string root(lemma.lemma), tag(lemma.tag); |
|
3272
|
0
|
0
|
|
|
|
|
if (converter) converter->convert(lemma); |
|
|
|
0
|
|
|
|
|
|
|
3273
|
0
|
0
|
|
|
|
|
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
|
|
|
0
|
|
|
|
|
|
|
3274
|
0
|
0
|
|
|
|
|
format_tree(root, tag, lemma, converter); |
|
3275
|
0
|
|
|
|
|
|
} |
|
3276
|
|
|
|
|
|
|
|
|
3277
|
0
|
|
|
|
|
|
void format_tree(const string& root, const string& tag, tagged_lemma& tree, const tagset_converter* converter) const { |
|
3278
|
0
|
|
|
|
|
|
vector children; |
|
3279
|
|
|
|
|
|
|
|
|
3280
|
0
|
0
|
|
|
|
|
if (converter) { |
|
3281
|
0
|
0
|
|
|
|
|
tagged_lemma current(root, tag); |
|
3282
|
0
|
0
|
|
|
|
|
converter->convert(current); |
|
3283
|
0
|
0
|
|
|
|
|
tree.lemma.append(" ").append(current.lemma); |
|
3284
|
|
|
|
|
|
|
} else { |
|
3285
|
0
|
0
|
|
|
|
|
tree.lemma.append(" ").append(root); |
|
3286
|
|
|
|
|
|
|
} |
|
3287
|
|
|
|
|
|
|
|
|
3288
|
0
|
0
|
|
|
|
|
if (derinet->children(root, children)) |
|
|
|
0
|
|
|
|
|
|
|
3289
|
0
|
0
|
|
|
|
|
for (auto&& child : children) |
|
3290
|
0
|
0
|
|
|
|
|
format_tree(child.lemma, tag, tree, converter); |
|
3291
|
0
|
0
|
|
|
|
|
tree.lemma.push_back(' '); |
|
3292
|
0
|
|
|
|
|
|
} |
|
3293
|
|
|
|
|
|
|
|
|
3294
|
|
|
|
|
|
|
private: |
|
3295
|
|
|
|
|
|
|
const derivator* derinet; |
|
3296
|
|
|
|
|
|
|
}; |
|
3297
|
|
|
|
|
|
|
|
|
3298
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_tree_derivation_formatter(const derivator* derinet) { |
|
3299
|
0
|
0
|
|
|
|
|
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
|
|
|
0
|
|
|
|
|
|
|
3300
|
|
|
|
|
|
|
} |
|
3301
|
|
|
|
|
|
|
|
|
3302
|
0
|
|
|
|
|
|
derivation_formatter* derivation_formatter::new_derivation_formatter(string_piece name, const derivator* derinet) { |
|
3303
|
0
|
0
|
|
|
|
|
if (name == "none") return new_none_derivation_formatter(); |
|
3304
|
0
|
0
|
|
|
|
|
if (name == "root") return new_root_derivation_formatter(derinet); |
|
3305
|
0
|
0
|
|
|
|
|
if (name == "path") return new_path_derivation_formatter(derinet); |
|
3306
|
0
|
0
|
|
|
|
|
if (name == "tree") return new_tree_derivation_formatter(derinet); |
|
3307
|
|
|
|
|
|
|
return nullptr; |
|
3308
|
|
|
|
|
|
|
} |
|
3309
|
|
|
|
|
|
|
|
|
3310
|
|
|
|
|
|
|
} // namespace morphodita |
|
3311
|
|
|
|
|
|
|
|
|
3312
|
|
|
|
|
|
|
///////// |
|
3313
|
|
|
|
|
|
|
// File: morphodita/morpho/small_stringops.h |
|
3314
|
|
|
|
|
|
|
///////// |
|
3315
|
|
|
|
|
|
|
|
|
3316
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
3317
|
|
|
|
|
|
|
// |
|
3318
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
3319
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3320
|
|
|
|
|
|
|
// |
|
3321
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3322
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3323
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3324
|
|
|
|
|
|
|
|
|
3325
|
|
|
|
|
|
|
namespace morphodita { |
|
3326
|
|
|
|
|
|
|
|
|
3327
|
|
|
|
|
|
|
// Declarations |
|
3328
|
|
|
|
|
|
|
inline bool small_memeq(const void* a, const void* b, size_t len); |
|
3329
|
|
|
|
|
|
|
inline void small_memcpy(void* dest, const void* src, size_t len); |
|
3330
|
|
|
|
|
|
|
|
|
3331
|
|
|
|
|
|
|
// Definitions |
|
3332
|
|
|
|
|
|
|
bool small_memeq(const void* a_void, const void* b_void, size_t len) { |
|
3333
|
|
|
|
|
|
|
const char* a = (const char*)a_void; |
|
3334
|
|
|
|
|
|
|
const char* b = (const char*)b_void; |
|
3335
|
|
|
|
|
|
|
|
|
3336
|
1980
|
0
|
|
|
|
|
while (len--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3337
|
1735
|
0
|
|
|
|
|
if (*a++ != *b++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3338
|
|
|
|
|
|
|
return false; |
|
3339
|
|
|
|
|
|
|
return true; |
|
3340
|
|
|
|
|
|
|
} |
|
3341
|
|
|
|
|
|
|
|
|
3342
|
|
|
|
|
|
|
void small_memcpy(void* dest_void, const void* src_void, size_t len) { |
|
3343
|
|
|
|
|
|
|
char* dest = (char*)dest_void; |
|
3344
|
|
|
|
|
|
|
const char* src = (const char*)src_void; |
|
3345
|
|
|
|
|
|
|
|
|
3346
|
1353
|
0
|
|
|
|
|
while (len--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3347
|
967
|
|
|
|
|
|
*dest++ = *src++; |
|
3348
|
|
|
|
|
|
|
} |
|
3349
|
|
|
|
|
|
|
|
|
3350
|
|
|
|
|
|
|
} // namespace morphodita |
|
3351
|
|
|
|
|
|
|
|
|
3352
|
|
|
|
|
|
|
///////// |
|
3353
|
|
|
|
|
|
|
// File: trainer/training_failure.h |
|
3354
|
|
|
|
|
|
|
///////// |
|
3355
|
|
|
|
|
|
|
|
|
3356
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
3357
|
|
|
|
|
|
|
// |
|
3358
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
3359
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3360
|
|
|
|
|
|
|
// |
|
3361
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3362
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3363
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3364
|
|
|
|
|
|
|
|
|
3365
|
|
|
|
|
|
|
namespace utils { |
|
3366
|
|
|
|
|
|
|
|
|
3367
|
0
|
|
|
|
|
|
class training_error : public runtime_error { |
|
3368
|
|
|
|
|
|
|
public: |
|
3369
|
|
|
|
|
|
|
training_error(); |
|
3370
|
|
|
|
|
|
|
|
|
3371
|
|
|
|
|
|
|
static ostringstream message_collector; |
|
3372
|
|
|
|
|
|
|
}; |
|
3373
|
|
|
|
|
|
|
|
|
3374
|
|
|
|
|
|
|
#define training_failure(message) throw (training_error::message_collector << message, training_error()) |
|
3375
|
|
|
|
|
|
|
|
|
3376
|
|
|
|
|
|
|
} // namespace utils |
|
3377
|
|
|
|
|
|
|
|
|
3378
|
|
|
|
|
|
|
///////// |
|
3379
|
|
|
|
|
|
|
// File: utils/binary_encoder.h |
|
3380
|
|
|
|
|
|
|
///////// |
|
3381
|
|
|
|
|
|
|
|
|
3382
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
3383
|
|
|
|
|
|
|
// |
|
3384
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
3385
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3386
|
|
|
|
|
|
|
// |
|
3387
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3388
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3389
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3390
|
|
|
|
|
|
|
|
|
3391
|
|
|
|
|
|
|
namespace utils { |
|
3392
|
|
|
|
|
|
|
|
|
3393
|
|
|
|
|
|
|
// |
|
3394
|
|
|
|
|
|
|
// Declarations |
|
3395
|
|
|
|
|
|
|
// |
|
3396
|
|
|
|
|
|
|
|
|
3397
|
0
|
|
|
|
|
|
class binary_encoder { |
|
3398
|
|
|
|
|
|
|
public: |
|
3399
|
|
|
|
|
|
|
inline binary_encoder(); |
|
3400
|
|
|
|
|
|
|
|
|
3401
|
|
|
|
|
|
|
inline void add_1B(unsigned val); |
|
3402
|
|
|
|
|
|
|
inline void add_2B(unsigned val); |
|
3403
|
|
|
|
|
|
|
inline void add_4B(unsigned val); |
|
3404
|
|
|
|
|
|
|
inline void add_float(double val); |
|
3405
|
|
|
|
|
|
|
inline void add_double(double val); |
|
3406
|
|
|
|
|
|
|
inline void add_str(string_piece str); |
|
3407
|
|
|
|
|
|
|
inline void add_data(string_piece data); |
|
3408
|
|
|
|
|
|
|
template inline void add_data(const vector& data); |
|
3409
|
|
|
|
|
|
|
template inline void add_data(const T* data, size_t elements); |
|
3410
|
|
|
|
|
|
|
|
|
3411
|
|
|
|
|
|
|
vector data; |
|
3412
|
|
|
|
|
|
|
}; |
|
3413
|
|
|
|
|
|
|
|
|
3414
|
|
|
|
|
|
|
// |
|
3415
|
|
|
|
|
|
|
// Definitions |
|
3416
|
|
|
|
|
|
|
// |
|
3417
|
|
|
|
|
|
|
|
|
3418
|
0
|
|
|
|
|
|
binary_encoder::binary_encoder() { |
|
3419
|
0
|
0
|
|
|
|
|
data.reserve(16); |
|
3420
|
0
|
|
|
|
|
|
} |
|
3421
|
|
|
|
|
|
|
|
|
3422
|
0
|
|
|
|
|
|
void binary_encoder::add_1B(unsigned val) { |
|
3423
|
0
|
0
|
|
|
|
|
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3424
|
0
|
|
|
|
|
|
data.push_back(val); |
|
3425
|
0
|
|
|
|
|
|
} |
|
3426
|
|
|
|
|
|
|
|
|
3427
|
0
|
|
|
|
|
|
void binary_encoder::add_2B(unsigned val) { |
|
3428
|
0
|
0
|
|
|
|
|
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3429
|
0
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(uint16_t)); |
|
3430
|
0
|
|
|
|
|
|
} |
|
3431
|
|
|
|
|
|
|
|
|
3432
|
|
|
|
|
|
|
void binary_encoder::add_4B(unsigned val) { |
|
3433
|
|
|
|
|
|
|
if (uint32_t(val) != val) training_failure("Should encode value " << val << " in four bytes!"); |
|
3434
|
0
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(uint32_t)); |
|
3435
|
|
|
|
|
|
|
} |
|
3436
|
|
|
|
|
|
|
|
|
3437
|
|
|
|
|
|
|
void binary_encoder::add_float(double val) { |
|
3438
|
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(float)); |
|
3439
|
|
|
|
|
|
|
} |
|
3440
|
|
|
|
|
|
|
|
|
3441
|
|
|
|
|
|
|
void binary_encoder::add_double(double val) { |
|
3442
|
|
|
|
|
|
|
data.insert(data.end(), (unsigned char*) &val, ((unsigned char*) &val) + sizeof(double)); |
|
3443
|
|
|
|
|
|
|
} |
|
3444
|
|
|
|
|
|
|
|
|
3445
|
0
|
|
|
|
|
|
void binary_encoder::add_str(string_piece str) { |
|
3446
|
0
|
|
|
|
|
|
add_1B(str.len < 255 ? str.len : 255); |
|
3447
|
0
|
0
|
|
|
|
|
if (!(str.len < 255)) add_4B(str.len); |
|
3448
|
|
|
|
|
|
|
add_data(str); |
|
3449
|
0
|
|
|
|
|
|
} |
|
3450
|
|
|
|
|
|
|
|
|
3451
|
|
|
|
|
|
|
void binary_encoder::add_data(string_piece data) { |
|
3452
|
0
|
|
|
|
|
|
this->data.insert(this->data.end(), (const unsigned char*) data.str, (const unsigned char*) (data.str + data.len)); |
|
3453
|
|
|
|
|
|
|
} |
|
3454
|
|
|
|
|
|
|
|
|
3455
|
|
|
|
|
|
|
template |
|
3456
|
|
|
|
|
|
|
void binary_encoder::add_data(const vector& data) { |
|
3457
|
0
|
|
|
|
|
|
this->data.insert(this->data.end(), (const unsigned char*) data.data(), (const unsigned char*) (data.data() + data.size())); |
|
3458
|
|
|
|
|
|
|
} |
|
3459
|
|
|
|
|
|
|
|
|
3460
|
|
|
|
|
|
|
template |
|
3461
|
|
|
|
|
|
|
void binary_encoder::add_data(const T* data, size_t elements) { |
|
3462
|
0
|
|
|
|
|
|
this->data.insert(this->data.end(), (const unsigned char*) data, (const unsigned char*) (data + elements)); |
|
3463
|
|
|
|
|
|
|
} |
|
3464
|
|
|
|
|
|
|
|
|
3465
|
|
|
|
|
|
|
} // namespace utils |
|
3466
|
|
|
|
|
|
|
|
|
3467
|
|
|
|
|
|
|
///////// |
|
3468
|
|
|
|
|
|
|
// File: utils/pointer_decoder.h |
|
3469
|
|
|
|
|
|
|
///////// |
|
3470
|
|
|
|
|
|
|
|
|
3471
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
3472
|
|
|
|
|
|
|
// |
|
3473
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
3474
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3475
|
|
|
|
|
|
|
// |
|
3476
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3477
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3478
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3479
|
|
|
|
|
|
|
|
|
3480
|
|
|
|
|
|
|
namespace utils { |
|
3481
|
|
|
|
|
|
|
|
|
3482
|
|
|
|
|
|
|
// |
|
3483
|
|
|
|
|
|
|
// Declarations |
|
3484
|
|
|
|
|
|
|
// |
|
3485
|
|
|
|
|
|
|
|
|
3486
|
|
|
|
|
|
|
class pointer_decoder { |
|
3487
|
|
|
|
|
|
|
public: |
|
3488
|
|
|
|
|
|
|
inline pointer_decoder(const unsigned char*& data); |
|
3489
|
|
|
|
|
|
|
inline unsigned next_1B(); |
|
3490
|
|
|
|
|
|
|
inline unsigned next_2B(); |
|
3491
|
|
|
|
|
|
|
inline unsigned next_4B(); |
|
3492
|
|
|
|
|
|
|
inline void next_str(string& str); |
|
3493
|
|
|
|
|
|
|
template inline const T* next(unsigned elements); |
|
3494
|
|
|
|
|
|
|
|
|
3495
|
|
|
|
|
|
|
private: |
|
3496
|
|
|
|
|
|
|
const unsigned char*& data; |
|
3497
|
|
|
|
|
|
|
}; |
|
3498
|
|
|
|
|
|
|
|
|
3499
|
|
|
|
|
|
|
// |
|
3500
|
|
|
|
|
|
|
// Definitions |
|
3501
|
|
|
|
|
|
|
// |
|
3502
|
|
|
|
|
|
|
|
|
3503
|
14
|
|
|
|
|
|
pointer_decoder::pointer_decoder(const unsigned char*& data) : data(data) {} |
|
3504
|
|
|
|
|
|
|
|
|
3505
|
|
|
|
|
|
|
unsigned pointer_decoder::next_1B() { |
|
3506
|
0
|
|
|
|
|
|
return *data++; |
|
3507
|
|
|
|
|
|
|
} |
|
3508
|
|
|
|
|
|
|
|
|
3509
|
|
|
|
|
|
|
unsigned pointer_decoder::next_2B() { |
|
3510
|
|
|
|
|
|
|
uint16_t result; |
|
3511
|
14
|
|
|
|
|
|
memcpy(&result, data, sizeof(uint16_t)); |
|
3512
|
14
|
|
|
|
|
|
data += sizeof(uint16_t); |
|
3513
|
1
|
|
|
|
|
|
return result; |
|
3514
|
|
|
|
|
|
|
} |
|
3515
|
|
|
|
|
|
|
|
|
3516
|
|
|
|
|
|
|
unsigned pointer_decoder::next_4B() { |
|
3517
|
|
|
|
|
|
|
uint32_t result; |
|
3518
|
13
|
|
|
|
|
|
memcpy(&result, data, sizeof(uint32_t)); |
|
3519
|
13
|
|
|
|
|
|
data += sizeof(uint32_t); |
|
3520
|
|
|
|
|
|
|
return result; |
|
3521
|
|
|
|
|
|
|
} |
|
3522
|
|
|
|
|
|
|
|
|
3523
|
|
|
|
|
|
|
void pointer_decoder::next_str(string& str) { |
|
3524
|
|
|
|
|
|
|
unsigned len = next_1B(); |
|
3525
|
|
|
|
|
|
|
if (len == 255) len = next_4B(); |
|
3526
|
|
|
|
|
|
|
str.assign(next(len), len); |
|
3527
|
|
|
|
|
|
|
} |
|
3528
|
|
|
|
|
|
|
|
|
3529
|
|
|
|
|
|
|
template const T* pointer_decoder::next(unsigned elements) { |
|
3530
|
3
|
|
|
|
|
|
const T* result = (const T*) data; |
|
3531
|
0
|
|
|
|
|
|
data += sizeof(T) * elements; |
|
3532
|
|
|
|
|
|
|
return result; |
|
3533
|
|
|
|
|
|
|
} |
|
3534
|
|
|
|
|
|
|
|
|
3535
|
|
|
|
|
|
|
} // namespace utils |
|
3536
|
|
|
|
|
|
|
|
|
3537
|
|
|
|
|
|
|
///////// |
|
3538
|
|
|
|
|
|
|
// File: utils/unaligned_access.h |
|
3539
|
|
|
|
|
|
|
///////// |
|
3540
|
|
|
|
|
|
|
|
|
3541
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
3542
|
|
|
|
|
|
|
// |
|
3543
|
|
|
|
|
|
|
// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of |
|
3544
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3545
|
|
|
|
|
|
|
// |
|
3546
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3547
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3548
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3549
|
|
|
|
|
|
|
|
|
3550
|
|
|
|
|
|
|
namespace utils { |
|
3551
|
|
|
|
|
|
|
|
|
3552
|
|
|
|
|
|
|
// |
|
3553
|
|
|
|
|
|
|
// Declarations |
|
3554
|
|
|
|
|
|
|
// |
|
3555
|
|
|
|
|
|
|
|
|
3556
|
|
|
|
|
|
|
template |
|
3557
|
|
|
|
|
|
|
inline T unaligned_load(const P* ptr); |
|
3558
|
|
|
|
|
|
|
|
|
3559
|
|
|
|
|
|
|
template |
|
3560
|
|
|
|
|
|
|
inline T unaligned_load_inc(const P*& ptr); |
|
3561
|
|
|
|
|
|
|
|
|
3562
|
|
|
|
|
|
|
template |
|
3563
|
|
|
|
|
|
|
inline void unaligned_store(P* ptr, T value); |
|
3564
|
|
|
|
|
|
|
|
|
3565
|
|
|
|
|
|
|
template |
|
3566
|
|
|
|
|
|
|
inline void unaligned_store_inc(P*& ptr, T value); |
|
3567
|
|
|
|
|
|
|
|
|
3568
|
|
|
|
|
|
|
template |
|
3569
|
|
|
|
|
|
|
T* unaligned_lower_bound(T* first, size_t size, T val); |
|
3570
|
|
|
|
|
|
|
|
|
3571
|
|
|
|
|
|
|
template |
|
3572
|
|
|
|
|
|
|
T* unaligned_upper_bound(T* first, size_t size, T val); |
|
3573
|
|
|
|
|
|
|
|
|
3574
|
|
|
|
|
|
|
// |
|
3575
|
|
|
|
|
|
|
// Definitions |
|
3576
|
|
|
|
|
|
|
// |
|
3577
|
|
|
|
|
|
|
|
|
3578
|
|
|
|
|
|
|
template |
|
3579
|
|
|
|
|
|
|
inline T unaligned_load(const P* ptr) { |
|
3580
|
|
|
|
|
|
|
T value; |
|
3581
|
|
|
|
|
|
|
memcpy(&value, ptr, sizeof(T)); |
|
3582
|
|
|
|
|
|
|
return value; |
|
3583
|
|
|
|
|
|
|
} |
|
3584
|
|
|
|
|
|
|
|
|
3585
|
|
|
|
|
|
|
template |
|
3586
|
|
|
|
|
|
|
inline T unaligned_load_inc(const P*& ptr) { |
|
3587
|
|
|
|
|
|
|
T value; |
|
3588
|
|
|
|
|
|
|
memcpy(&value, ptr, sizeof(T)); |
|
3589
|
0
|
|
|
|
|
|
((const char*&)ptr) += sizeof(T); |
|
3590
|
|
|
|
|
|
|
return value; |
|
3591
|
|
|
|
|
|
|
} |
|
3592
|
|
|
|
|
|
|
|
|
3593
|
|
|
|
|
|
|
template |
|
3594
|
|
|
|
|
|
|
inline void unaligned_store(P* ptr, T value) { |
|
3595
|
|
|
|
|
|
|
memcpy(ptr, &value, sizeof(T)); |
|
3596
|
|
|
|
|
|
|
} |
|
3597
|
|
|
|
|
|
|
|
|
3598
|
|
|
|
|
|
|
template |
|
3599
|
|
|
|
|
|
|
inline void unaligned_store_inc(P*& ptr, T value) { |
|
3600
|
|
|
|
|
|
|
memcpy(ptr, &value, sizeof(T)); |
|
3601
|
50
|
|
|
|
|
|
((char*&)ptr) += sizeof(T); |
|
3602
|
|
|
|
|
|
|
} |
|
3603
|
|
|
|
|
|
|
|
|
3604
|
|
|
|
|
|
|
template |
|
3605
|
|
|
|
|
|
|
T* unaligned_lower_bound(T* first, size_t size, T val) { |
|
3606
|
40
|
100
|
|
|
|
|
while (size) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3607
|
30
|
|
|
|
|
|
size_t step = size >> 1; |
|
3608
|
30
|
100
|
|
|
|
|
if (unaligned_load(first + step) < val) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3609
|
9
|
|
|
|
|
|
first += step + 1; |
|
3610
|
9
|
|
|
|
|
|
size -= step + 1; |
|
3611
|
|
|
|
|
|
|
} else { |
|
3612
|
|
|
|
|
|
|
size = step; |
|
3613
|
|
|
|
|
|
|
} |
|
3614
|
|
|
|
|
|
|
} |
|
3615
|
|
|
|
|
|
|
return first; |
|
3616
|
|
|
|
|
|
|
} |
|
3617
|
|
|
|
|
|
|
|
|
3618
|
|
|
|
|
|
|
template |
|
3619
|
|
|
|
|
|
|
T* unaligned_upper_bound(T* first, size_t size, T val) { |
|
3620
|
|
|
|
|
|
|
while (size) { |
|
3621
|
|
|
|
|
|
|
size_t step = size >> 1; |
|
3622
|
|
|
|
|
|
|
if (!(val < unaligned_load(first + step))) { |
|
3623
|
|
|
|
|
|
|
first += step + 1; |
|
3624
|
|
|
|
|
|
|
size -= step + 1; |
|
3625
|
|
|
|
|
|
|
} else { |
|
3626
|
|
|
|
|
|
|
size = step; |
|
3627
|
|
|
|
|
|
|
} |
|
3628
|
|
|
|
|
|
|
} |
|
3629
|
|
|
|
|
|
|
return first; |
|
3630
|
|
|
|
|
|
|
} |
|
3631
|
|
|
|
|
|
|
|
|
3632
|
|
|
|
|
|
|
} // namespace utils |
|
3633
|
|
|
|
|
|
|
|
|
3634
|
|
|
|
|
|
|
///////// |
|
3635
|
|
|
|
|
|
|
// File: morphodita/morpho/persistent_unordered_map.h |
|
3636
|
|
|
|
|
|
|
///////// |
|
3637
|
|
|
|
|
|
|
|
|
3638
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
3639
|
|
|
|
|
|
|
// |
|
3640
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
3641
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3642
|
|
|
|
|
|
|
// |
|
3643
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3644
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3645
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3646
|
|
|
|
|
|
|
|
|
3647
|
|
|
|
|
|
|
namespace morphodita { |
|
3648
|
|
|
|
|
|
|
|
|
3649
|
|
|
|
|
|
|
// Declarations |
|
3650
|
103
|
0
|
|
|
|
|
class persistent_unordered_map { |
|
|
|
0
|
|
|
|
|
|
|
3651
|
|
|
|
|
|
|
public: |
|
3652
|
|
|
|
|
|
|
// Accessing function |
|
3653
|
|
|
|
|
|
|
template |
|
3654
|
|
|
|
|
|
|
inline const unsigned char* at(const char* str, int len, EntrySize entry_size) const; |
|
3655
|
|
|
|
|
|
|
|
|
3656
|
|
|
|
|
|
|
template |
|
3657
|
|
|
|
|
|
|
inline const T* at_typed(const char* str, int len) const; |
|
3658
|
|
|
|
|
|
|
|
|
3659
|
|
|
|
|
|
|
template |
|
3660
|
|
|
|
|
|
|
inline void iter(const char* str, int len, EntryProcess entry_process) const; |
|
3661
|
|
|
|
|
|
|
|
|
3662
|
|
|
|
|
|
|
template |
|
3663
|
|
|
|
|
|
|
inline void iter_all(EntryProcess entry_process) const; |
|
3664
|
|
|
|
|
|
|
|
|
3665
|
|
|
|
|
|
|
// Two helper functions accessing some internals |
|
3666
|
|
|
|
|
|
|
inline int max_length() const; |
|
3667
|
|
|
|
|
|
|
inline const unsigned char* data_start(int len) const; |
|
3668
|
|
|
|
|
|
|
|
|
3669
|
|
|
|
|
|
|
// Creation functions |
|
3670
|
|
|
|
|
|
|
persistent_unordered_map() {} |
|
3671
|
|
|
|
|
|
|
template |
|
3672
|
|
|
|
|
|
|
persistent_unordered_map(const unordered_map& map, double load_factor, EntryEncode entry_encode); |
|
3673
|
|
|
|
|
|
|
template |
|
3674
|
|
|
|
|
|
|
persistent_unordered_map(const unordered_map& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode); |
|
3675
|
|
|
|
|
|
|
|
|
3676
|
|
|
|
|
|
|
// Manual creation functions |
|
3677
|
|
|
|
|
|
|
inline void resize(unsigned elems); |
|
3678
|
|
|
|
|
|
|
inline void add(const char* str, int str_len, int data_len); |
|
3679
|
|
|
|
|
|
|
inline void done_adding(); |
|
3680
|
|
|
|
|
|
|
inline unsigned char* fill(const char* str, int str_len, int data_len); |
|
3681
|
|
|
|
|
|
|
inline void done_filling(); |
|
3682
|
|
|
|
|
|
|
|
|
3683
|
|
|
|
|
|
|
// Serialization |
|
3684
|
|
|
|
|
|
|
inline void load(binary_decoder& data); |
|
3685
|
|
|
|
|
|
|
inline void save(binary_encoder& enc); |
|
3686
|
|
|
|
|
|
|
|
|
3687
|
|
|
|
|
|
|
private: |
|
3688
|
|
|
|
|
|
|
struct fnv_hash; |
|
3689
|
|
|
|
|
|
|
vector hashes; |
|
3690
|
|
|
|
|
|
|
|
|
3691
|
|
|
|
|
|
|
template |
|
3692
|
|
|
|
|
|
|
void construct(const map& map, double load_factor, EntryEncode entry_encode); |
|
3693
|
|
|
|
|
|
|
}; |
|
3694
|
|
|
|
|
|
|
|
|
3695
|
|
|
|
|
|
|
// Definitions |
|
3696
|
1063
|
0
|
|
|
|
|
struct persistent_unordered_map::fnv_hash { |
|
3697
|
24
|
|
|
|
|
|
fnv_hash(unsigned num) { |
|
3698
|
24
|
|
|
|
|
|
mask = 1; |
|
3699
|
76
|
100
|
|
|
|
|
while (mask < num) |
|
3700
|
52
|
|
|
|
|
|
mask <<= 1; |
|
3701
|
24
|
50
|
|
|
|
|
hash.resize(mask + 1); |
|
3702
|
24
|
|
|
|
|
|
mask--; |
|
3703
|
24
|
|
|
|
|
|
} |
|
3704
|
484
|
|
|
|
|
|
fnv_hash(binary_decoder& data) { |
|
3705
|
484
|
50
|
|
|
|
|
uint32_t size = data.next_4B(); |
|
3706
|
484
|
|
|
|
|
|
mask = size - 2; |
|
3707
|
484
|
50
|
|
|
|
|
hash.resize(size); |
|
3708
|
484
|
50
|
|
|
|
|
memcpy(hash.data(), data.next(size), size * sizeof(uint32_t)); |
|
3709
|
|
|
|
|
|
|
|
|
3710
|
484
|
50
|
|
|
|
|
size = data.next_4B(); |
|
3711
|
484
|
50
|
|
|
|
|
this->data.resize(size); |
|
3712
|
484
|
100
|
|
|
|
|
if (size) memcpy(this->data.data(), data.next(size), size); |
|
|
|
50
|
|
|
|
|
|
|
3713
|
484
|
|
|
|
|
|
} |
|
3714
|
|
|
|
|
|
|
|
|
3715
|
|
|
|
|
|
|
inline uint32_t index(const char* data, int len) const { |
|
3716
|
464
|
0
|
|
|
|
|
if (len <= 0) return 0; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
3717
|
456
|
0
|
|
|
|
|
if (len == 1) return unaligned_load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3718
|
427
|
0
|
|
|
|
|
if (len == 2) return unaligned_load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3719
|
|
|
|
|
|
|
|
|
3720
|
|
|
|
|
|
|
uint32_t hash = 2166136261U; |
|
3721
|
1563
|
0
|
|
|
|
|
while (len--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
3722
|
1413
|
|
|
|
|
|
hash = (hash ^ unsigned((signed char)*data++)) * 16777619U; |
|
3723
|
150
|
|
|
|
|
|
return hash & mask; |
|
3724
|
|
|
|
|
|
|
} |
|
3725
|
|
|
|
|
|
|
|
|
3726
|
|
|
|
|
|
|
inline void save(binary_encoder& enc); |
|
3727
|
|
|
|
|
|
|
|
|
3728
|
|
|
|
|
|
|
unsigned mask; |
|
3729
|
|
|
|
|
|
|
vector hash; |
|
3730
|
|
|
|
|
|
|
vector data; |
|
3731
|
|
|
|
|
|
|
}; |
|
3732
|
|
|
|
|
|
|
|
|
3733
|
|
|
|
|
|
|
template |
|
3734
|
8
|
|
|
|
|
|
const unsigned char* persistent_unordered_map::at(const char* str, int len, EntrySize entry_size) const { |
|
3735
|
8
|
0
|
|
|
|
|
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3736
|
|
|
|
|
|
|
|
|
3737
|
8
|
|
|
|
|
|
unsigned index = hashes[len].index(str, len); |
|
3738
|
16
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index]; |
|
3739
|
16
|
|
|
|
|
|
const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1]; |
|
3740
|
|
|
|
|
|
|
|
|
3741
|
8
|
0
|
|
|
|
|
if (len <= 2) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3742
|
8
|
0
|
|
|
|
|
return data != end ? data + len : nullptr; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3743
|
|
|
|
|
|
|
|
|
3744
|
0
|
0
|
|
|
|
|
while (data < end) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3745
|
0
|
0
|
|
|
|
|
if (small_memeq(str, data, len)) return data + len; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3746
|
0
|
|
|
|
|
|
data += len; |
|
3747
|
|
|
|
|
|
|
pointer_decoder decoder(data); |
|
3748
|
0
|
|
|
|
|
|
entry_size(decoder); |
|
3749
|
|
|
|
|
|
|
} |
|
3750
|
|
|
|
|
|
|
|
|
3751
|
|
|
|
|
|
|
return nullptr; |
|
3752
|
|
|
|
|
|
|
} |
|
3753
|
|
|
|
|
|
|
|
|
3754
|
|
|
|
|
|
|
template |
|
3755
|
438
|
|
|
|
|
|
const T* persistent_unordered_map::at_typed(const char* str, int len) const { |
|
3756
|
438
|
100
|
|
|
|
|
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
|
100
|
|
|
|
|
|
|
3757
|
|
|
|
|
|
|
|
|
3758
|
408
|
|
|
|
|
|
unsigned index = hashes[len].index(str, len); |
|
3759
|
816
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index]; |
|
3760
|
816
|
|
|
|
|
|
const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1]; |
|
3761
|
|
|
|
|
|
|
|
|
3762
|
408
|
100
|
|
|
|
|
if (len <= 2) |
|
|
|
100
|
|
|
|
|
|
|
3763
|
293
|
100
|
|
|
|
|
return data != end ? (const T*)(data + len) : nullptr; |
|
|
|
100
|
|
|
|
|
|
|
3764
|
|
|
|
|
|
|
|
|
3765
|
146
|
100
|
|
|
|
|
while (data < end) { |
|
|
|
100
|
|
|
|
|
|
|
3766
|
133
|
100
|
|
|
|
|
if (small_memeq(str, data, len)) return (const T*)(data + len); |
|
|
|
100
|
|
|
|
|
|
|
3767
|
31
|
|
|
|
|
|
data += len + sizeof(T); |
|
3768
|
|
|
|
|
|
|
} |
|
3769
|
|
|
|
|
|
|
|
|
3770
|
|
|
|
|
|
|
return nullptr; |
|
3771
|
|
|
|
|
|
|
} |
|
3772
|
|
|
|
|
|
|
|
|
3773
|
|
|
|
|
|
|
template |
|
3774
|
8
|
|
|
|
|
|
void persistent_unordered_map::iter(const char* str, int len, EntryProcess entry_process) const { |
|
3775
|
8
|
0
|
|
|
|
|
if (unsigned(len) >= hashes.size()) return; |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3776
|
|
|
|
|
|
|
|
|
3777
|
8
|
|
|
|
|
|
unsigned index = hashes[len].index(str, len); |
|
3778
|
16
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data() + hashes[len].hash[index]; |
|
3779
|
8
|
|
|
|
|
|
const unsigned char* end = hashes[len].data.data() + hashes[len].hash[index+1]; |
|
3780
|
|
|
|
|
|
|
|
|
3781
|
21
|
0
|
|
|
|
|
while (data < end) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3782
|
|
|
|
|
|
|
auto start = (const char*) data; |
|
3783
|
13
|
|
|
|
|
|
data += len; |
|
3784
|
|
|
|
|
|
|
pointer_decoder decoder(data); |
|
3785
|
13
|
|
|
|
|
|
entry_process(start, decoder); |
|
3786
|
|
|
|
|
|
|
} |
|
3787
|
|
|
|
|
|
|
} |
|
3788
|
|
|
|
|
|
|
|
|
3789
|
|
|
|
|
|
|
template |
|
3790
|
2
|
|
|
|
|
|
void persistent_unordered_map::iter_all(EntryProcess entry_process) const { |
|
3791
|
2
|
100
|
|
|
|
|
for (unsigned len = 0; len < hashes.size(); len++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3792
|
1
|
|
|
|
|
|
const unsigned char* data = hashes[len].data.data(); |
|
3793
|
|
|
|
|
|
|
const unsigned char* end = data + hashes[len].data.size(); |
|
3794
|
|
|
|
|
|
|
|
|
3795
|
2
|
100
|
|
|
|
|
while (data < end) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3796
|
|
|
|
|
|
|
auto start = (const char*) data; |
|
3797
|
1
|
|
|
|
|
|
data += len; |
|
3798
|
|
|
|
|
|
|
pointer_decoder decoder(data); |
|
3799
|
1
|
|
|
|
|
|
entry_process(start, len, decoder); |
|
3800
|
|
|
|
|
|
|
} |
|
3801
|
|
|
|
|
|
|
} |
|
3802
|
1
|
|
|
|
|
|
} |
|
3803
|
|
|
|
|
|
|
|
|
3804
|
|
|
|
|
|
|
int persistent_unordered_map::max_length() const { |
|
3805
|
20
|
|
|
|
|
|
return hashes.size(); |
|
3806
|
|
|
|
|
|
|
} |
|
3807
|
|
|
|
|
|
|
|
|
3808
|
|
|
|
|
|
|
const unsigned char* persistent_unordered_map::data_start(int len) const { |
|
3809
|
30
|
0
|
|
|
|
|
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
3810
|
|
|
|
|
|
|
} |
|
3811
|
|
|
|
|
|
|
|
|
3812
|
24
|
|
|
|
|
|
void persistent_unordered_map::resize(unsigned elems) { |
|
3813
|
24
|
100
|
|
|
|
|
if (hashes.size() == 0) hashes.emplace_back(1); |
|
3814
|
22
|
100
|
|
|
|
|
else if (hashes.size() == 1) hashes.emplace_back(1<<8); |
|
3815
|
20
|
100
|
|
|
|
|
else if (hashes.size() == 2) hashes.emplace_back(1<<16); |
|
3816
|
18
|
|
|
|
|
|
else hashes.emplace_back(elems); |
|
3817
|
24
|
|
|
|
|
|
} |
|
3818
|
|
|
|
|
|
|
|
|
3819
|
20
|
|
|
|
|
|
void persistent_unordered_map::add(const char* str, int str_len, int data_len) { |
|
3820
|
20
|
50
|
|
|
|
|
if (unsigned(str_len) < hashes.size()) |
|
3821
|
20
|
|
|
|
|
|
hashes[str_len].hash[hashes[str_len].index(str, str_len)] += str_len + data_len; |
|
3822
|
20
|
|
|
|
|
|
} |
|
3823
|
|
|
|
|
|
|
|
|
3824
|
2
|
|
|
|
|
|
void persistent_unordered_map::done_adding() { |
|
3825
|
26
|
100
|
|
|
|
|
for (auto&& hash : hashes) { |
|
3826
|
|
|
|
|
|
|
int total = 0; |
|
3827
|
131657
|
100
|
|
|
|
|
for (auto&& len : hash.hash) total += len, len = total - len; |
|
3828
|
24
|
|
|
|
|
|
hash.data.resize(total); |
|
3829
|
|
|
|
|
|
|
} |
|
3830
|
2
|
|
|
|
|
|
} |
|
3831
|
|
|
|
|
|
|
|
|
3832
|
20
|
|
|
|
|
|
unsigned char* persistent_unordered_map::fill(const char* str, int str_len, int data_len) { |
|
3833
|
20
|
50
|
|
|
|
|
if (unsigned(str_len) < hashes.size()) { |
|
3834
|
20
|
|
|
|
|
|
unsigned index = hashes[str_len].index(str, str_len); |
|
3835
|
40
|
|
|
|
|
|
unsigned offset = hashes[str_len].hash[index]; |
|
3836
|
20
|
|
|
|
|
|
small_memcpy(hashes[str_len].data.data() + offset, str, str_len); |
|
3837
|
20
|
|
|
|
|
|
hashes[str_len].hash[index] += str_len + data_len; |
|
3838
|
20
|
|
|
|
|
|
return hashes[str_len].data.data() + offset + str_len; |
|
3839
|
|
|
|
|
|
|
} |
|
3840
|
|
|
|
|
|
|
return nullptr; |
|
3841
|
|
|
|
|
|
|
} |
|
3842
|
|
|
|
|
|
|
|
|
3843
|
2
|
|
|
|
|
|
void persistent_unordered_map::done_filling() { |
|
3844
|
26
|
100
|
|
|
|
|
for (auto&& hash : hashes) |
|
3845
|
131657
|
100
|
|
|
|
|
for (int i = hash.hash.size() - 1; i >= 0; i--) |
|
3846
|
131633
|
100
|
|
|
|
|
hash.hash[i] = i > 0 ? hash.hash[i-1] : 0; |
|
3847
|
2
|
|
|
|
|
|
} |
|
3848
|
|
|
|
|
|
|
|
|
3849
|
103
|
|
|
|
|
|
// Replaces the contents of the map with tables read from `data`: a one-byte
// table count followed by that many serialized tables, each constructed
// directly from the decoder.
void persistent_unordered_map::load(binary_decoder& data) {
  // The count is read before the old tables are dropped.
  const unsigned table_count = data.next_1B();

  hashes.clear();
  for (unsigned remaining = table_count; remaining != 0; remaining--)
    hashes.emplace_back(data);
}
|
3856
|
|
|
|
|
|
|
|
|
3857
|
|
|
|
|
|
|
} // namespace morphodita |
|
3858
|
|
|
|
|
|
|
|
|
3859
|
|
|
|
|
|
|
///////// |
|
3860
|
|
|
|
|
|
|
// File: morphodita/derivator/derivator_dictionary.h |
|
3861
|
|
|
|
|
|
|
///////// |
|
3862
|
|
|
|
|
|
|
|
|
3863
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
3864
|
|
|
|
|
|
|
// |
|
3865
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
3866
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3867
|
|
|
|
|
|
|
// |
|
3868
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3869
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3870
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3871
|
|
|
|
|
|
|
|
|
3872
|
|
|
|
|
|
|
namespace morphodita { |
|
3873
|
|
|
|
|
|
|
|
|
3874
|
0
|
|
|
|
|
|
class derivator_dictionary : public derivator { |
|
3875
|
|
|
|
|
|
|
public: |
|
3876
|
|
|
|
|
|
|
virtual bool parent(string_piece lemma, derivated_lemma& parent) const override; |
|
3877
|
|
|
|
|
|
|
virtual bool children(string_piece lemma, vector& children) const override; |
|
3878
|
|
|
|
|
|
|
|
|
3879
|
|
|
|
|
|
|
bool load(istream& is); |
|
3880
|
|
|
|
|
|
|
|
|
3881
|
|
|
|
|
|
|
private: |
|
3882
|
|
|
|
|
|
|
friend class morpho; |
|
3883
|
|
|
|
|
|
|
const morpho* dictionary; |
|
3884
|
|
|
|
|
|
|
persistent_unordered_map derinet; |
|
3885
|
|
|
|
|
|
|
}; |
|
3886
|
|
|
|
|
|
|
|
|
3887
|
|
|
|
|
|
|
} // namespace morphodita |
|
3888
|
|
|
|
|
|
|
|
|
3889
|
|
|
|
|
|
|
///////// |
|
3890
|
|
|
|
|
|
|
// File: utils/compressor.h |
|
3891
|
|
|
|
|
|
|
///////// |
|
3892
|
|
|
|
|
|
|
|
|
3893
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
3894
|
|
|
|
|
|
|
// |
|
3895
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
3896
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3897
|
|
|
|
|
|
|
// |
|
3898
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3899
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3900
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3901
|
|
|
|
|
|
|
|
|
3902
|
|
|
|
|
|
|
namespace utils { |
|
3903
|
|
|
|
|
|
|
|
|
3904
|
|
|
|
|
|
|
class binary_decoder; |
|
3905
|
|
|
|
|
|
|
class binary_encoder; |
|
3906
|
|
|
|
|
|
|
|
|
3907
|
|
|
|
|
|
|
class compressor { |
|
3908
|
|
|
|
|
|
|
public: |
|
3909
|
|
|
|
|
|
|
static bool load(istream& is, binary_decoder& data); |
|
3910
|
|
|
|
|
|
|
static bool save(ostream& os, const binary_encoder& enc); |
|
3911
|
|
|
|
|
|
|
}; |
|
3912
|
|
|
|
|
|
|
|
|
3913
|
|
|
|
|
|
|
} // namespace utils |
|
3914
|
|
|
|
|
|
|
|
|
3915
|
|
|
|
|
|
|
///////// |
|
3916
|
|
|
|
|
|
|
// File: morphodita/derivator/derivator_dictionary.cpp |
|
3917
|
|
|
|
|
|
|
///////// |
|
3918
|
|
|
|
|
|
|
|
|
3919
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
3920
|
|
|
|
|
|
|
// |
|
3921
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
3922
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
3923
|
|
|
|
|
|
|
// |
|
3924
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
3925
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3926
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
3927
|
|
|
|
|
|
|
|
|
3928
|
|
|
|
|
|
|
namespace morphodita { |
|
3929
|
|
|
|
|
|
|
|
|
3930
|
0
|
|
|
|
|
|
// Looks up the derivational parent of `lemma`.
//
// When a morphological dictionary is attached, the lemma is first shortened
// to its lemma id. On success the parent's lemma (including its comment, if
// any) is stored in `parent.lemma` and true is returned. Otherwise
// `parent.lemma` is cleared and false is returned.
//
// Stored entry layout behind the key: 1B comment length, comment bytes,
// 4B encoded parent, 2B children count, 4B per encoded child.
bool derivator_dictionary::parent(string_piece lemma, derivated_lemma& parent) const {
  if (dictionary) lemma.len = dictionary->lemma_id_len(lemma);

  auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) {
    data.next<char>(data.next_1B());
    data.next_4B();
    data.next<uint32_t>(data.next_2B());
  });
  if (lemma_data) {
    // The entry is packed without padding, so the 4-byte parent field may sit
    // at any address; read it with unaligned_load instead of dereferencing a
    // cast uint32_t pointer.
    const uint32_t parent_encoded = unaligned_load<uint32_t>(lemma_data + 1 + *lemma_data);
    if (parent_encoded) {
      // Encoding: low 8 bits = parent key length, high 24 bits = offset of the
      // parent's record within the table for that key length.
      const unsigned parent_len = parent_encoded & 0xFF;
      const unsigned char* parent_data = derinet.data_start(parent_len) + (parent_encoded >> 8);
      parent.lemma.assign((const char*) parent_data, parent_len);
      // The byte behind the key is the comment length; append the comment.
      if (parent_data[parent_len])
        parent.lemma.append((const char*) parent_data + parent_len + 1, parent_data[parent_len]);
      return true;
    }
  }

  parent.lemma.clear();
  return false;
}
|
3952
|
|
|
|
|
|
|
|
|
3953
|
0
|
|
|
|
|
|
bool derivator_dictionary::children(string_piece lemma, vector& children) const { |
|
3954
|
0
|
0
|
|
|
|
|
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
|
3955
|
|
|
|
|
|
|
|
|
3956
|
0
|
|
|
|
|
|
auto lemma_data = derinet.at(lemma.str, lemma.len, [](pointer_decoder& data) { |
|
3957
|
|
|
|
|
|
|
data.next(data.next_1B()); |
|
3958
|
|
|
|
|
|
|
data.next_4B(); |
|
3959
|
|
|
|
|
|
|
data.next(data.next_2B()); |
|
3960
|
0
|
|
|
|
|
|
}); |
|
3961
|
0
|
0
|
|
|
|
|
if (lemma_data) { |
|
3962
|
0
|
|
|
|
|
|
auto children_len = *(uint16_t*)(lemma_data + 1 + *lemma_data + 4); |
|
3963
|
0
|
|
|
|
|
|
auto children_encoded = (uint32_t*)(lemma_data + 1 + *lemma_data + 4 + 2); |
|
3964
|
0
|
0
|
|
|
|
|
if (children_len) { |
|
3965
|
0
|
|
|
|
|
|
children.resize(children_len); |
|
3966
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < children_len; i++) { |
|
3967
|
0
|
|
|
|
|
|
unsigned child_len = children_encoded[i] & 0xFF; |
|
3968
|
0
|
|
|
|
|
|
auto child_data = derinet.data_start(child_len) + (children_encoded[i] >> 8); |
|
3969
|
0
|
|
|
|
|
|
children[i].lemma.assign((const char*) child_data, child_len); |
|
3970
|
0
|
0
|
|
|
|
|
if (child_data[child_len]) |
|
3971
|
0
|
|
|
|
|
|
children[i].lemma.append((const char*) child_data + child_len + 1, child_data[child_len]); |
|
3972
|
|
|
|
|
|
|
} |
|
3973
|
|
|
|
|
|
|
return true; |
|
3974
|
|
|
|
|
|
|
} |
|
3975
|
|
|
|
|
|
|
} |
|
3976
|
0
|
|
|
|
|
|
children.clear(); |
|
3977
|
0
|
|
|
|
|
|
return false; |
|
3978
|
|
|
|
|
|
|
} |
|
3979
|
|
|
|
|
|
|
|
|
3980
|
0
|
|
|
|
|
|
// Loads the derivation dictionary from a binary stream.
//
// After the table sizes, the stream holds a list of lemmas, each with an
// optional comment, a children count and an edit script that derives the
// parent lemma. The list is processed three times:
//   pass 1 - register the size of every entry (derinet.add),
//   pass 2 - write key, comment and zeroed parent/children fields
//            (derinet.fill),
//   pass 3 - reconstruct every parent lemma and link child and parent.
//
// Returns false when the stream cannot be decoded or when its contents are
// inconsistent. Inconsistencies that would otherwise lead to out-of-range
// memory accesses are checked explicitly, because assert() is compiled out
// when NDEBUG is defined.
bool derivator_dictionary::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  // Skips one stored entry: 1B comment length + comment, 4B encoded parent,
  // 2B children count + 4B per child.
  auto entry_size = [](pointer_decoder& entry) {
    entry.next<char>(entry.next_1B());
    entry.next_4B();
    entry.next<uint32_t>(entry.next_2B());
  };

  try {
    for (int i = data.next_1B(); i > 0; i--)
      derinet.resize(data.next_4B());

    unsigned data_position = data.tell();
    vector<char> lemma, parent;
    for (int pass = 1; pass <= 3; pass++) {
      if (pass > 1) data.seek(data_position);

      lemma.clear();
      for (int i = data.next_4B(); i > 0; i--) {
        // Lemmas are prefix-compressed: drop some trailing characters of the
        // previous lemma and append new ones.
        const unsigned dropped = data.next_1B();
        if (dropped > lemma.size()) return false; // would underflow the new size
        lemma.resize(lemma.size() - dropped);
        for (int j = data.next_1B(); j > 0; j--)
          lemma.push_back(data.next_1B());

        unsigned char lemma_comment_len = data.next_1B();
        const char* lemma_comment = lemma_comment_len ? data.next<char>(lemma_comment_len) : nullptr;

        unsigned children = data.next_2B();

        // Edit script turning the lemma into its parent; no operations means
        // the lemma has no parent.
        if (pass == 3) parent.clear();
        enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 };
        int operations = data.next_1B();
        if (operations) {
          int remove_start = operations & REMOVE_START ? data.next_1B() : 0;
          int remove_end = operations & REMOVE_END ? data.next_1B() : 0;
          // The kept middle part [remove_start, size - remove_end) must be a
          // valid range of the lemma.
          if (unsigned(remove_start) + unsigned(remove_end) > lemma.size()) return false;
          if (operations & ADD_START) {
            int add_start = data.next_1B();
            const char* str = data.next<char>(add_start);
            if (pass == 3) parent.assign(str, str + add_start);
          }
          if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end);
          if (operations & ADD_END) {
            int add_end = data.next_1B();
            const char* str = data.next<char>(add_end);
            if (pass == 3) parent.insert(parent.end(), str, str + add_end);
          }
        }

        if (pass == 1) {
          derinet.add(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children);
        } else if (pass == 2) {
          unsigned char* lemma_data = derinet.fill(lemma.data(), lemma.size(), 1 + lemma_comment_len + 4 + 2 + 4 * children);
          // fill() returns nullptr when no table exists for this lemma length.
          if (!lemma_data) return false;
          *lemma_data++ = lemma_comment_len;
          while (lemma_comment_len--) *lemma_data++ = *lemma_comment++;
          unaligned_store_inc<uint32_t>(lemma_data, 0);        // no parent yet
          unaligned_store_inc<uint16_t>(lemma_data, children);
          // The last child slot temporarily counts how many children have
          // been linked so far (see pass 3).
          if (children) unaligned_store<uint32_t>(((uint32_t*)lemma_data) + children - 1, 0);
        } else if (pass == 3 && !parent.empty()) {
          auto lemma_data = derinet.at(lemma.data(), lemma.size(), entry_size);
          auto parent_data = derinet.at(parent.data(), parent.size(), entry_size);
          if (!lemma_data || !parent_data) return false;

          // Store the parent in the lemma's entry as (offset << 8) | length.
          // The entries live in derinet's own mutable storage, so writing
          // through the pointers returned by at() is intended here.
          unsigned parent_offset = parent_data - parent.size() - derinet.data_start(parent.size());
          if (parent.size() >= (1<<8) || parent_offset >= (1<<24)) return false;
          unaligned_store<uint32_t>((void *)(lemma_data + 1 + *lemma_data), (parent_offset << 8) | parent.size());

          // Store the lemma into the next free child slot of the parent.
          unsigned lemma_offset = lemma_data - lemma.size() - derinet.data_start(lemma.size());
          if (lemma.size() >= (1<<8) || lemma_offset >= (1<<24)) return false;
          auto children_len = unaligned_load<uint16_t>(parent_data + 1 + *parent_data + 4);
          if (!children_len) return false; // the parent declares no children
          auto children = (uint32_t*)(parent_data + 1 + *parent_data + 4 + 2);
          auto child_index = unaligned_load<uint32_t>(children + children_len - 1);
          if (child_index >= children_len) return false; // more children than declared
          unaligned_store<uint32_t>(children + child_index, (lemma_offset << 8) | lemma.size());
          // Advance the counter kept in the last slot, unless that slot has
          // just been overwritten by the last child.
          if (child_index+1 < children_len)
            unaligned_store<uint32_t>(children + children_len - 1, unaligned_load<uint32_t>(children + children_len - 1) + 1);
        }
      }

      if (pass == 1)
        derinet.done_adding();
      if (pass == 2)
        derinet.done_filling();
    }
  } catch (binary_decoder_error&) {
    return false;
  }
  return true;
}
|
4070
|
|
|
|
|
|
|
|
|
4071
|
|
|
|
|
|
|
} // namespace morphodita |
|
4072
|
|
|
|
|
|
|
|
|
4073
|
|
|
|
|
|
|
///////// |
|
4074
|
|
|
|
|
|
|
// File: morphodita/morpho/casing_variants.h |
|
4075
|
|
|
|
|
|
|
///////// |
|
4076
|
|
|
|
|
|
|
|
|
4077
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4078
|
|
|
|
|
|
|
// |
|
4079
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4080
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4081
|
|
|
|
|
|
|
// |
|
4082
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4083
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4084
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4085
|
|
|
|
|
|
|
|
|
4086
|
|
|
|
|
|
|
namespace morphodita { |
|
4087
|
|
|
|
|
|
|
|
|
4088
|
7
|
|
|
|
|
|
// Generates the casing variants of `form` that differ from the form itself.
// Only lowercasing is performed:
// - form_lc: all characters lowercased,
// - form_uclc: first character kept, the remaining ones lowercased.
// A variant that would be identical to an already available one is not
// generated and the corresponding output string is left untouched:
// - no uppercase/titlecase character at all: nothing is generated,
// - uppercase/titlecase first character only: form_lc,
// - uppercase/titlecase only behind the first character: form_lc,
// - both: form_lc and form_uclc.
// Generated variants are appended to the output strings.
inline void generate_casing_variants(string_piece form, string& form_uclc, string& form_lc) {
  using namespace unilib;

  // Detect uppercase+titlecase characters: in the first character, and
  // anywhere in the rest (stopping at the first one found).
  string_piece scan = form;
  const bool first_Lut = unicode::category(utf8::decode(scan.str, scan.len)) & unicode::Lut;
  bool rest_has_Lut = false;
  while (scan.len && !rest_has_Lut)
    rest_has_Lut = unicode::category(utf8::decode(scan.str, scan.len)) & unicode::Lut;

  // The form is already all lowercase.
  if (!first_Lut && !rest_has_Lut) return;

  // Common case allowing fast execution: lowercase the first character and
  // copy the rest verbatim.
  if (!rest_has_Lut) {
    form_lc.reserve(form.len);
    string_piece rest = form;
    utf8::append(form_lc, unicode::lowercase(utf8::decode(rest.str, rest.len)));
    form_lc.append(rest.str, rest.len);
    return;
  }

  // Uppercase only behind the first character: lowercase everything.
  if (!first_Lut) {
    form_lc.reserve(form.len);
    utf8::map(unicode::lowercase, form.str, form.len, form_lc);
    return;
  }

  // Uppercase both in the first character and in the rest: build both variants.
  form_lc.reserve(form.len);
  form_uclc.reserve(form.len);
  string_piece rest = form;
  char32_t first = utf8::decode(rest.str, rest.len);
  utf8::append(form_lc, unicode::lowercase(first));
  utf8::append(form_uclc, first);
  while (rest.len) {
    char32_t lowered = unicode::lowercase(utf8::decode(rest.str, rest.len));
    utf8::append(form_lc, lowered);
    utf8::append(form_uclc, lowered);
  }
}
|
4128
|
|
|
|
|
|
|
|
|
4129
|
|
|
|
|
|
|
} // namespace morphodita |
|
4130
|
|
|
|
|
|
|
|
|
4131
|
|
|
|
|
|
|
///////// |
|
4132
|
|
|
|
|
|
|
// File: morphodita/morpho/czech_lemma_addinfo.h |
|
4133
|
|
|
|
|
|
|
///////// |
|
4134
|
|
|
|
|
|
|
|
|
4135
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4136
|
|
|
|
|
|
|
// |
|
4137
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4138
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4139
|
|
|
|
|
|
|
// |
|
4140
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4141
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4142
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4143
|
|
|
|
|
|
|
|
|
4144
|
|
|
|
|
|
|
namespace morphodita { |
|
4145
|
|
|
|
|
|
|
|
|
4146
|
|
|
|
|
|
|
// Declarations |
|
4147
|
0
|
|
|
|
|
|
struct czech_lemma_addinfo { |
|
4148
|
|
|
|
|
|
|
inline static int raw_lemma_len(string_piece lemma); |
|
4149
|
|
|
|
|
|
|
inline static int lemma_id_len(string_piece lemma); |
|
4150
|
|
|
|
|
|
|
inline static string format(const unsigned char* addinfo, int addinfo_len); |
|
4151
|
|
|
|
|
|
|
inline static bool generatable(const unsigned char* addinfo, int addinfo_len); |
|
4152
|
|
|
|
|
|
|
|
|
4153
|
|
|
|
|
|
|
inline int parse(string_piece lemma, bool die_on_failure = false); |
|
4154
|
|
|
|
|
|
|
inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len); |
|
4155
|
|
|
|
|
|
|
|
|
4156
|
|
|
|
|
|
|
vector data; |
|
4157
|
|
|
|
|
|
|
}; |
|
4158
|
|
|
|
|
|
|
|
|
4159
|
|
|
|
|
|
|
// Definitions |
|
4160
|
0
|
|
|
|
|
|
// Returns the length of the raw lemma: the lemma ends at the first '`', '_'
// or '-' directly followed by a digit, found on a non-first position.
// When no such terminator exists, the whole lemma is the raw lemma.
int czech_lemma_addinfo::raw_lemma_len(string_piece lemma) {
  for (unsigned pos = 1; pos < lemma.len; pos++) {
    const char current = lemma.str[pos];
    if (current == '`' || current == '_') return pos;

    if (current == '-' && pos + 1 < lemma.len) {
      const char following = lemma.str[pos + 1];
      if (following >= '0' && following <= '9') return pos;
    }
  }
  return lemma.len;
}
|
4168
|
|
|
|
|
|
|
|
|
4169
|
0
|
|
|
|
|
|
// Returns the length of the lemma id: the raw lemma plus, when present, the
// "-<digits>" suffix directly following it. As with the raw lemma, the
// terminating '`', '_' or "-<digit>" is only recognised on a non-first
// position.
int czech_lemma_addinfo::lemma_id_len(string_piece lemma) {
  const auto is_digit = [](char c) { return c >= '0' && c <= '9'; };

  for (unsigned pos = 1; pos < lemma.len; pos++) {
    const char current = lemma.str[pos];
    if (current == '`' || current == '_') return pos;

    // Anything but '-' followed by a digit belongs to the raw lemma.
    if (current != '-' || pos + 1 >= lemma.len || !is_digit(lemma.str[pos + 1]))
      continue;

    // Include the dash and the whole run of digits.
    unsigned id_end = pos + 2;
    while (id_end < lemma.len && is_digit(lemma.str[id_end])) id_end++;
    return id_end;
  }
  return lemma.len;
}
|
4182
|
|
|
|
|
|
|
|
|
4183
|
0
|
|
|
|
|
|
// Converts encoded additional info to text: "-<number>" when the first byte
// is not 255, followed by the remaining bytes verbatim. An empty info yields
// an empty string.
string czech_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) {
  string res;
  if (!addinfo_len) return res;

  // At most "-ddd" precedes the remaining addinfo_len - 1 bytes.
  res.reserve(addinfo_len + 4);

  if (addinfo[0] != 255) {
    char num[5];
    snprintf(num, sizeof(num), "-%u", addinfo[0]);
    res += num;
  }

  if (addinfo_len > 1)
    res.append((const char*) addinfo + 1, addinfo_len - 1);

  return res;
}
|
4199
|
|
|
|
|
|
|
|
|
4200
|
|
|
|
|
|
|
// Returns false when the comment part of the additional info (everything
// behind the first byte) contains the marker "_,x", and true otherwise.
bool czech_lemma_addinfo::generatable(const unsigned char* addinfo, int addinfo_len) {
  int pos = 1;
  while (pos + 2 < addinfo_len) {
    const bool marker_here = addinfo[pos] == '_' && addinfo[pos + 1] == ',' && addinfo[pos + 2] == 'x';
    if (marker_here) return false;
    pos++;
  }
  return true;
}
|
4207
|
|
|
|
|
|
|
|
|
4208
|
0
|
|
|
|
|
|
// Parses the additional info of `lemma` into `data` and returns the length of
// the raw lemma.
//
// `data` is left empty when the lemma carries no additional info. Otherwise
// data[0] holds the lemma number (255 when there is none) and the remaining
// bytes hold the comment text.
//
// A lemma number is invalid when no digits follow the dash, when it is
// followed by something other than '`' or '_', or when it is 255 or larger.
// An invalid number and info longer than 255 bytes are reported through
// training_failure when `die_on_failure` is set; otherwise the number is
// treated as missing and the info is truncated to 255 bytes.
int czech_lemma_addinfo::parse(string_piece lemma, bool die_on_failure) {
  data.clear();

  const char* const lemma_end = lemma.str + lemma.len;
  const char* lemma_info = lemma.str + raw_lemma_len(lemma);
  if (lemma_info < lemma_end) {
    int lemma_num = 255;
    const char* lemma_additional_info = lemma_info;

    if (*lemma_info == '-') {
      lemma_num = 0;
      for (lemma_additional_info = lemma_info + 1;
           lemma_additional_info < lemma_end && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9');
           lemma_additional_info++)
        // Stop accumulating once the number is far beyond the valid range, so
        // that a long run of digits cannot overflow the int. All digits are
        // still consumed and the number is still reported as out of range.
        if (lemma_num < 100000000)
          lemma_num = 10 * lemma_num + (*lemma_additional_info - '0');

      if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma_end && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) {
        if (die_on_failure)
          training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!");
        else
          lemma_num = 255;
      }
    }
    data.emplace_back(lemma_num);
    while (lemma_additional_info < lemma_end)
      data.push_back(*(unsigned char*)lemma_additional_info++);

    if (data.size() > 255) {
      if (die_on_failure)
        // Print exactly the info part of the lemma; lemma_info itself is not
        // bounded by lemma.len when streamed as a C string.
        training_failure("Too long lemma info " << string(lemma_info, lemma_end) << " in lemma " << lemma << '!');
      else
        data.resize(255);
    }
  }

  return lemma_info - lemma.str;
}
|
4244
|
|
|
|
|
|
|
|
|
4245
|
|
|
|
|
|
|
bool czech_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) { |
|
4246
|
0
|
0
|
|
|
|
|
if (data.empty()) return true; |
|
4247
|
0
|
0
|
|
|
|
|
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4248
|
|
|
|
|
|
|
return true; |
|
4249
|
|
|
|
|
|
|
} |
|
4250
|
|
|
|
|
|
|
|
|
4251
|
|
|
|
|
|
|
} // namespace morphodita |
|
4252
|
|
|
|
|
|
|
|
|
4253
|
|
|
|
|
|
|
///////// |
|
4254
|
|
|
|
|
|
|
// File: morphodita/morpho/tag_filter.h |
|
4255
|
|
|
|
|
|
|
///////// |
|
4256
|
|
|
|
|
|
|
|
|
4257
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4258
|
|
|
|
|
|
|
// |
|
4259
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4260
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4261
|
|
|
|
|
|
|
// |
|
4262
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4263
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4264
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4265
|
|
|
|
|
|
|
|
|
4266
|
|
|
|
|
|
|
namespace morphodita { |
|
4267
|
|
|
|
|
|
|
|
|
4268
|
|
|
|
|
|
|
// Declarations |
|
4269
|
0
|
|
|
|
|
|
class tag_filter { |
|
4270
|
|
|
|
|
|
|
public: |
|
4271
|
|
|
|
|
|
|
tag_filter(const char* filter = nullptr); |
|
4272
|
|
|
|
|
|
|
|
|
4273
|
|
|
|
|
|
|
inline bool matches(const char* tag) const; |
|
4274
|
|
|
|
|
|
|
|
|
4275
|
|
|
|
|
|
|
private: |
|
4276
|
|
|
|
|
|
|
struct char_filter { |
|
4277
|
|
|
|
|
|
|
char_filter(int pos, bool negate, int chars_offset, int chars_len) |
|
4278
|
0
|
|
|
|
|
|
: pos(pos), negate(negate), chars_offset(chars_offset), chars_len(chars_len) {} |
|
4279
|
|
|
|
|
|
|
|
|
4280
|
|
|
|
|
|
|
int pos; |
|
4281
|
|
|
|
|
|
|
bool negate; |
|
4282
|
|
|
|
|
|
|
int chars_offset, chars_len; |
|
4283
|
|
|
|
|
|
|
}; |
|
4284
|
|
|
|
|
|
|
|
|
4285
|
|
|
|
|
|
|
string wildcard; |
|
4286
|
|
|
|
|
|
|
std::vector filters; |
|
4287
|
|
|
|
|
|
|
}; |
|
4288
|
|
|
|
|
|
|
|
|
4289
|
|
|
|
|
|
|
// Definitions |
|
4290
|
0
|
|
|
|
|
|
inline bool tag_filter::matches(const char* tag) const { |
|
4291
|
0
|
0
|
|
|
|
|
if (filters.empty()) return true; |
|
4292
|
|
|
|
|
|
|
|
|
4293
|
|
|
|
|
|
|
int tag_pos = 0; |
|
4294
|
0
|
0
|
|
|
|
|
for (auto&& filter : filters) { |
|
4295
|
|
|
|
|
|
|
// Skip until next filter position. If the tag ends prematurely, accept. |
|
4296
|
0
|
0
|
|
|
|
|
while (tag_pos < filter.pos) |
|
4297
|
0
|
0
|
|
|
|
|
if (!tag[tag_pos++]) |
|
4298
|
|
|
|
|
|
|
return true; |
|
4299
|
0
|
0
|
|
|
|
|
if (!tag[tag_pos]) |
|
4300
|
|
|
|
|
|
|
return true; |
|
4301
|
|
|
|
|
|
|
|
|
4302
|
|
|
|
|
|
|
// We assume filter.chars_len >= 1. |
|
4303
|
0
|
|
|
|
|
|
bool matched = (wildcard[filter.chars_offset] == tag[tag_pos]) ^ filter.negate; |
|
4304
|
0
|
0
|
|
|
|
|
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
|
|
|
0
|
|
|
|
|
|
|
4305
|
0
|
|
|
|
|
|
matched = (wildcard[filter.chars_offset + i] == tag[tag_pos]) ^ filter.negate; |
|
4306
|
0
|
0
|
|
|
|
|
if (!matched) return false; |
|
4307
|
|
|
|
|
|
|
} |
|
4308
|
|
|
|
|
|
|
return true; |
|
4309
|
|
|
|
|
|
|
} |
|
4310
|
|
|
|
|
|
|
|
|
4311
|
|
|
|
|
|
|
} // namespace morphodita |
|
4312
|
|
|
|
|
|
|
|
|
4313
|
|
|
|
|
|
|
///////// |
|
4314
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_dictionary.h |
|
4315
|
|
|
|
|
|
|
///////// |
|
4316
|
|
|
|
|
|
|
|
|
4317
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4318
|
|
|
|
|
|
|
// |
|
4319
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4320
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4321
|
|
|
|
|
|
|
// |
|
4322
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4323
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4324
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4325
|
|
|
|
|
|
|
|
|
4326
|
|
|
|
|
|
|
namespace morphodita { |
|
4327
|
|
|
|
|
|
|
|
|
4328
|
|
|
|
|
|
|
// Declarations |
|
4329
|
|
|
|
|
|
|
template |
|
4330
|
2
|
|
|
|
|
|
class morpho_dictionary { |
|
4331
|
|
|
|
|
|
|
public: |
|
4332
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
4333
|
|
|
|
|
|
|
void analyze(string_piece form, vector& lemmas) const; |
|
4334
|
|
|
|
|
|
|
bool generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms) const; |
|
4335
|
|
|
|
|
|
|
private: |
|
4336
|
|
|
|
|
|
|
persistent_unordered_map lemmas, roots, suffixes; |
|
4337
|
|
|
|
|
|
|
|
|
4338
|
|
|
|
|
|
|
vector tags; |
|
4339
|
|
|
|
|
|
|
vector>>> classes; |
|
4340
|
|
|
|
|
|
|
}; |
|
4341
|
|
|
|
|
|
|
|
|
4342
|
|
|
|
|
|
|
// Definitions |
|
4343
|
|
|
|
|
|
|
template |
|
4344
|
1
|
|
|
|
|
|
void morpho_dictionary::load(binary_decoder& data) { |
|
4345
|
|
|
|
|
|
|
// Prepare lemmas and roots hashes |
|
4346
|
13
|
100
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4347
|
12
|
|
|
|
|
|
lemmas.resize(data.next_4B()); |
|
4348
|
13
|
100
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4349
|
12
|
|
|
|
|
|
roots.resize(data.next_4B()); |
|
4350
|
|
|
|
|
|
|
|
|
4351
|
|
|
|
|
|
|
// Perform two pass over the lemmas and roots data, filling the hashes. |
|
4352
|
|
|
|
|
|
|
|
|
4353
|
1
|
|
|
|
|
|
vector lemma(max(lemmas.max_length(), roots.max_length())); |
|
4354
|
1
|
50
|
|
|
|
|
vector root(max(lemmas.max_length(), roots.max_length())); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4355
|
|
|
|
|
|
|
unsigned data_position = data.tell(); |
|
4356
|
3
|
100
|
|
|
|
|
for (int pass = 1; pass <= 2; pass++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4357
|
2
|
100
|
|
|
|
|
if (pass > 1) data.seek(data_position); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4358
|
|
|
|
|
|
|
|
|
4359
|
|
|
|
|
|
|
int lemma_len = 0; |
|
4360
|
|
|
|
|
|
|
int root_len = 0; |
|
4361
|
|
|
|
|
|
|
|
|
4362
|
22
|
50
|
|
|
|
|
for (int i = data.next_4B(); i > 0; i--) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4363
|
20
|
50
|
|
|
|
|
lemma_len -= data.next_1B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4364
|
126
|
50
|
|
|
|
|
for (int i = data.next_1B(); i > 0; i--) |
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4365
|
106
|
50
|
|
|
|
|
lemma[lemma_len++] = data.next_1B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4366
|
20
|
50
|
|
|
|
|
unsigned char lemma_info_len = data.next_1B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4367
|
20
|
50
|
|
|
|
|
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4368
|
20
|
50
|
|
|
|
|
unsigned lemma_roots = data.next_1B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4369
|
|
|
|
|
|
|
|
|
4370
|
|
|
|
|
|
|
unsigned char* lemma_data /* to keep compiler happy */ = nullptr; |
|
4371
|
|
|
|
|
|
|
unsigned lemma_offset /* to keep compiler happy */ = 0; |
|
4372
|
|
|
|
|
|
|
|
|
4373
|
20
|
100
|
|
|
|
|
if (pass == 1) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4374
|
10
|
|
|
|
|
|
lemmas.add(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t))); |
|
4375
|
|
|
|
|
|
|
} else /*if (pass == 2)*/ { |
|
4376
|
10
|
|
|
|
|
|
lemma_data = lemmas.fill(lemma.data(), lemma_len, 1 + lemma_info_len + 1 + lemma_roots * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t))); |
|
4377
|
20
|
|
|
|
|
|
lemma_offset = lemma_data - lemma_len - lemmas.data_start(lemma_len); |
|
4378
|
|
|
|
|
|
|
|
|
4379
|
10
|
|
|
|
|
|
*lemma_data++ = lemma_info_len; |
|
4380
|
10
|
50
|
|
|
|
|
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4381
|
10
|
|
|
|
|
|
*lemma_data++ = lemma_roots; |
|
4382
|
|
|
|
|
|
|
} |
|
4383
|
|
|
|
|
|
|
|
|
4384
|
20
|
|
|
|
|
|
small_memcpy(root.data(), lemma.data(), lemma_len); root_len = lemma_len; |
|
4385
|
40
|
100
|
|
|
|
|
for (unsigned i = 0; i < lemma_roots; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4386
|
|
|
|
|
|
|
enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 }; |
|
4387
|
20
|
50
|
|
|
|
|
int operations = data.next_1B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4388
|
48
|
100
|
|
|
|
|
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4389
|
20
|
100
|
|
|
|
|
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4390
|
20
|
100
|
|
|
|
|
if (operations & ADD_START) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4391
|
44
|
50
|
|
|
|
|
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4392
|
14
|
100
|
|
|
|
|
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4393
|
|
|
|
|
|
|
} |
|
4394
|
20
|
100
|
|
|
|
|
if (operations & ADD_END) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4395
|
34
|
50
|
|
|
|
|
for (int len = data.next_1B(); len > 0; len--) |
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4396
|
22
|
50
|
|
|
|
|
root[root_len++] = data.next_1B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4397
|
20
|
50
|
|
|
|
|
uint16_t clas = data.next_2B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4398
|
|
|
|
|
|
|
|
|
4399
|
20
|
100
|
|
|
|
|
if (pass == 1) { // for each root |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4400
|
10
|
|
|
|
|
|
roots.add(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t)); |
|
4401
|
|
|
|
|
|
|
} else /*if (pass == 2)*/ { |
|
4402
|
10
|
|
|
|
|
|
unsigned char* root_data = roots.fill(root.data(), root_len, sizeof(uint16_t) + sizeof(uint32_t) + sizeof(uint8_t)); |
|
4403
|
20
|
|
|
|
|
|
unsigned root_offset = root_data - root_len - roots.data_start(root_len); |
|
4404
|
|
|
|
|
|
|
|
|
4405
|
|
|
|
|
|
|
unaligned_store_inc(root_data, clas); |
|
4406
|
|
|
|
|
|
|
unaligned_store_inc(root_data, lemma_offset); |
|
4407
|
|
|
|
|
|
|
unaligned_store_inc(root_data, lemma_len); |
|
4408
|
10
|
50
|
|
|
|
|
assert(uint8_t(lemma_len) == lemma_len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4409
|
|
|
|
|
|
|
|
|
4410
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, root_offset); |
|
4411
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, root_len); |
|
4412
|
|
|
|
|
|
|
unaligned_store_inc(lemma_data, clas); |
|
4413
|
10
|
50
|
|
|
|
|
assert(uint8_t(root_len) == root_len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4414
|
|
|
|
|
|
|
} |
|
4415
|
|
|
|
|
|
|
} |
|
4416
|
|
|
|
|
|
|
} |
|
4417
|
|
|
|
|
|
|
|
|
4418
|
2
|
100
|
|
|
|
|
if (pass == 1) { // after the whole pass |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4419
|
1
|
50
|
|
|
|
|
lemmas.done_adding(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4420
|
1
|
50
|
|
|
|
|
roots.done_adding(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4421
|
|
|
|
|
|
|
} else /*if (pass == 2)*/ { |
|
4422
|
1
|
|
|
|
|
|
lemmas.done_filling(); |
|
4423
|
1
|
|
|
|
|
|
roots.done_filling(); |
|
4424
|
|
|
|
|
|
|
} |
|
4425
|
|
|
|
|
|
|
} |
|
4426
|
|
|
|
|
|
|
|
|
4427
|
|
|
|
|
|
|
// Load tags |
|
4428
|
1
|
50
|
|
|
|
|
tags.resize(data.next_2B()); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4429
|
7
|
100
|
|
|
|
|
for (auto&& tag : tags) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4430
|
6
|
50
|
|
|
|
|
tag.resize(data.next_1B()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4431
|
403
|
100
|
|
|
|
|
for (unsigned i = 0; i < tag.size(); i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4432
|
397
|
50
|
|
|
|
|
tag[i] = data.next_1B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4433
|
|
|
|
|
|
|
} |
|
4434
|
|
|
|
|
|
|
|
|
4435
|
|
|
|
|
|
|
// Load suffixes |
|
4436
|
1
|
50
|
|
|
|
|
suffixes.load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4437
|
|
|
|
|
|
|
|
|
4438
|
|
|
|
|
|
|
// Fill classes from suffixes |
|
4439
|
2
|
50
|
|
|
|
|
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4440
|
|
|
|
|
|
|
unsigned classes_len = data.next_2B(); |
|
4441
|
|
|
|
|
|
|
const uint16_t* classes_ptr = data.next(classes_len); |
|
4442
|
1
|
|
|
|
|
|
const uint16_t* indices_ptr = data.next(classes_len + 1); |
|
4443
|
1
|
|
|
|
|
|
uint32_t tags_len = unaligned_load(indices_ptr); |
|
4444
|
7
|
100
|
|
|
|
|
for (unsigned i = 0; i < classes_len; i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4445
|
6
|
|
|
|
|
|
tags_len += uint16_t(unaligned_load(indices_ptr + i + 1) - unaligned_load(indices_ptr + i)); |
|
4446
|
|
|
|
|
|
|
const uint16_t* tags_ptr = data.next(tags_len); |
|
4447
|
|
|
|
|
|
|
|
|
4448
|
1
|
|
|
|
|
|
string suffix_str(suffix, len); |
|
4449
|
1
|
|
|
|
|
|
uint32_t index = unaligned_load(indices_ptr), prev_index = 0; |
|
4450
|
7
|
100
|
|
|
|
|
for (unsigned i = 0; i < classes_len; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4451
|
6
|
|
|
|
|
|
auto classes_ptr_i = unaligned_load(classes_ptr + i); |
|
4452
|
6
|
50
|
|
|
|
|
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4453
|
|
|
|
|
|
|
prev_index = index; |
|
4454
|
6
|
|
|
|
|
|
index += uint16_t(unaligned_load(indices_ptr + i + 1) - unaligned_load(indices_ptr + i)); |
|
4455
|
6
|
50
|
|
|
|
|
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4456
|
12
|
100
|
|
|
|
|
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4457
|
6
|
50
|
|
|
|
|
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4458
|
|
|
|
|
|
|
} |
|
4459
|
1
|
|
|
|
|
|
}); |
|
4460
|
1
|
|
|
|
|
|
} |
|
4461
|
|
|
|
|
|
|
|
|
4462
|
|
|
|
|
|
|
template |
|
4463
|
8
|
|
|
|
|
|
void morpho_dictionary::analyze(string_piece form, vector& lemmas) const { |
|
4464
|
|
|
|
|
|
|
int max_suffix_len = suffixes.max_length(); |
|
4465
|
|
|
|
|
|
|
|
|
4466
|
|
|
|
|
|
|
uint16_t* suff_stack[16]; vector suff_heap; |
|
4467
|
8
|
50
|
|
|
|
|
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4468
|
|
|
|
|
|
|
int suff_len = 0; |
|
4469
|
16
|
100
|
|
|
|
|
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4470
|
8
|
|
|
|
|
|
suff[suff_len] = (uint16_t*) suffixes.at(form.str + i, suff_len, [](pointer_decoder& data) { |
|
4471
|
0
|
|
|
|
|
|
data.next(2 * data.next_2B()); |
|
4472
|
|
|
|
|
|
|
data.next(data.next_2B()); |
|
4473
|
0
|
|
|
|
|
|
}); |
|
4474
|
8
|
|
|
|
|
|
if (!suff[suff_len]) break; |
|
4475
|
|
|
|
|
|
|
} |
|
4476
|
|
|
|
|
|
|
|
|
4477
|
16
|
100
|
|
|
|
|
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4478
|
8
|
50
|
|
|
|
|
if (unaligned_load(suff[suff_len])) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4479
|
8
|
|
|
|
|
|
unsigned suff_classes = unaligned_load(suff[suff_len]); |
|
4480
|
8
|
|
|
|
|
|
uint16_t* suff_data = suff[suff_len] + 1; |
|
4481
|
|
|
|
|
|
|
|
|
4482
|
21
|
50
|
|
|
|
|
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4483
|
|
|
|
|
|
|
uint16_t root_class = root_data.next_2B(); |
|
4484
|
|
|
|
|
|
|
unsigned lemma_offset = root_data.next_4B(); |
|
4485
|
|
|
|
|
|
|
unsigned lemma_len = root_data.next_1B(); |
|
4486
|
|
|
|
|
|
|
|
|
4487
|
26
|
100
|
|
|
|
|
if (small_memeq(form.str, root, root_len)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4488
|
19
|
|
|
|
|
|
uint16_t* suffix_class_ptr = unaligned_lower_bound(suff_data, suff_classes, root_class); |
|
4489
|
10
|
50
|
|
|
|
|
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4490
|
30
|
|
|
|
|
|
const unsigned char* lemma_data = this->lemmas.data_start(lemma_len) + lemma_offset; |
|
4491
|
|
|
|
|
|
|
string lemma((const char*)lemma_data, lemma_len); |
|
4492
|
10
|
50
|
|
|
|
|
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4493
|
|
|
|
|
|
|
|
|
4494
|
20
|
|
|
|
|
|
uint16_t* suff_tag_indices = suff_data + suff_classes; |
|
4495
|
10
|
|
|
|
|
|
uint16_t* suff_tags = suff_tag_indices + suff_classes + 1; |
|
4496
|
20
|
100
|
|
|
|
|
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4497
|
20
|
|
|
|
|
|
i < unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data) + 1); i++) |
|
4498
|
10
|
50
|
|
|
|
|
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4499
|
|
|
|
|
|
|
} |
|
4500
|
|
|
|
|
|
|
} |
|
4501
|
13
|
|
|
|
|
|
}); |
|
4502
|
|
|
|
|
|
|
} |
|
4503
|
8
|
|
|
|
|
|
} |
|
4504
|
|
|
|
|
|
|
|
|
4505
|
|
|
|
|
|
|
template |
|
4506
|
0
|
|
|
|
|
|
bool morpho_dictionary::generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms) const { |
|
4507
|
|
|
|
|
|
|
LemmaAddinfo addinfo; |
|
4508
|
0
|
0
|
|
|
|
|
int raw_lemma_len = addinfo.parse(lemma); |
|
|
|
0
|
|
|
|
|
|
|
4509
|
0
|
|
|
|
|
|
bool matched_lemma = false; |
|
4510
|
|
|
|
|
|
|
|
|
4511
|
0
|
0
|
|
|
|
|
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4512
|
|
|
|
|
|
|
unsigned lemma_info_len = data.next_1B(); |
|
4513
|
|
|
|
|
|
|
const auto* lemma_info = data.next(lemma_info_len); |
|
4514
|
|
|
|
|
|
|
unsigned lemma_roots_len = data.next_1B(); |
|
4515
|
0
|
|
|
|
|
|
auto* lemma_roots_ptr = data.next(lemma_roots_len * (sizeof(uint32_t) + sizeof(uint8_t) + sizeof(uint16_t))); |
|
4516
|
|
|
|
|
|
|
|
|
4517
|
0
|
0
|
|
|
|
|
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4518
|
0
|
|
|
|
|
|
matched_lemma = true; |
|
4519
|
|
|
|
|
|
|
|
|
4520
|
|
|
|
|
|
|
vector* forms = nullptr; |
|
4521
|
|
|
|
|
|
|
pointer_decoder lemma_roots(lemma_roots_ptr); |
|
4522
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4523
|
|
|
|
|
|
|
unsigned root_offset = lemma_roots.next_4B(); |
|
4524
|
|
|
|
|
|
|
unsigned root_len = lemma_roots.next_1B(); |
|
4525
|
|
|
|
|
|
|
unsigned clas = lemma_roots.next_2B(); |
|
4526
|
|
|
|
|
|
|
|
|
4527
|
0
|
|
|
|
|
|
const unsigned char* root_data = roots.data_start(root_len) + root_offset; |
|
4528
|
0
|
0
|
|
|
|
|
for (auto&& suffix : classes[clas]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4529
|
|
|
|
|
|
|
string root_with_suffix; |
|
4530
|
0
|
0
|
|
|
|
|
for (auto&& tag : suffix.second) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4531
|
0
|
0
|
|
|
|
|
if (filter.matches(tags[tag].c_str())) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4532
|
0
|
0
|
|
|
|
|
if (!forms) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4533
|
0
|
0
|
|
|
|
|
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4534
|
0
|
|
|
|
|
|
forms = &lemmas_forms.back().forms; |
|
4535
|
|
|
|
|
|
|
} |
|
4536
|
|
|
|
|
|
|
|
|
4537
|
0
|
0
|
|
|
|
|
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4538
|
0
|
0
|
|
|
|
|
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4539
|
|
|
|
|
|
|
root_with_suffix.assign((const char*)root_data, root_len); |
|
4540
|
|
|
|
|
|
|
root_with_suffix.append(suffix.first); |
|
4541
|
|
|
|
|
|
|
} |
|
4542
|
|
|
|
|
|
|
|
|
4543
|
0
|
0
|
|
|
|
|
forms->emplace_back(root_with_suffix, tags[tag]); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4544
|
|
|
|
|
|
|
} |
|
4545
|
|
|
|
|
|
|
} |
|
4546
|
|
|
|
|
|
|
} |
|
4547
|
|
|
|
|
|
|
} |
|
4548
|
0
|
|
|
|
|
|
}); |
|
4549
|
|
|
|
|
|
|
|
|
4550
|
0
|
|
|
|
|
|
return matched_lemma; |
|
4551
|
|
|
|
|
|
|
} |
|
4552
|
|
|
|
|
|
|
|
|
4553
|
|
|
|
|
|
|
} // namespace morphodita |
|
4554
|
|
|
|
|
|
|
|
|
4555
|
|
|
|
|
|
|
///////// |
|
4556
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_prefix_guesser.h |
|
4557
|
|
|
|
|
|
|
///////// |
|
4558
|
|
|
|
|
|
|
|
|
4559
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4560
|
|
|
|
|
|
|
// |
|
4561
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4562
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4563
|
|
|
|
|
|
|
// |
|
4564
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4565
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4566
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4567
|
|
|
|
|
|
|
|
|
4568
|
|
|
|
|
|
|
namespace morphodita { |
|
4569
|
|
|
|
|
|
|
|
|
4570
|
|
|
|
|
|
|
// Declarations |
|
4571
|
|
|
|
|
|
|
template |
|
4572
|
0
|
|
|
|
|
|
class morpho_prefix_guesser { |
|
4573
|
|
|
|
|
|
|
public: |
|
4574
|
0
|
|
|
|
|
|
morpho_prefix_guesser(const MorphoDictionary& dictionary) : dictionary(dictionary) {} |
|
4575
|
|
|
|
|
|
|
|
|
4576
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
4577
|
|
|
|
|
|
|
void analyze(string_piece form, vector& lemmas); |
|
4578
|
|
|
|
|
|
|
bool generate(string_piece lemma, const tag_filter& filter, vector& lemmas_forms); |
|
4579
|
|
|
|
|
|
|
|
|
4580
|
|
|
|
|
|
|
private: |
|
4581
|
|
|
|
|
|
|
const MorphoDictionary& dictionary; |
|
4582
|
|
|
|
|
|
|
vector tag_filters; |
|
4583
|
|
|
|
|
|
|
persistent_unordered_map prefixes_initial, prefixes_middle; |
|
4584
|
|
|
|
|
|
|
}; |
|
4585
|
|
|
|
|
|
|
|
|
4586
|
|
|
|
|
|
|
// Definitions |
|
4587
|
|
|
|
|
|
|
template |
|
4588
|
0
|
|
|
|
|
|
void morpho_prefix_guesser::load(binary_decoder& data) { |
|
4589
|
|
|
|
|
|
|
// Load and construct tag filters |
|
4590
|
0
|
0
|
|
|
|
|
for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) { |
|
4591
|
0
|
|
|
|
|
|
unsigned tag_filter_len = data.next_1B(); |
|
4592
|
0
|
|
|
|
|
|
string tag_filter(data.next(tag_filter_len), tag_filter_len); |
|
4593
|
|
|
|
|
|
|
|
|
4594
|
0
|
0
|
|
|
|
|
tag_filters.emplace_back(tag_filter.c_str()); |
|
4595
|
|
|
|
|
|
|
} |
|
4596
|
|
|
|
|
|
|
|
|
4597
|
|
|
|
|
|
|
// Load prefixes |
|
4598
|
0
|
|
|
|
|
|
prefixes_initial.load(data); |
|
4599
|
0
|
|
|
|
|
|
prefixes_middle.load(data); |
|
4600
|
0
|
|
|
|
|
|
} |
|
4601
|
|
|
|
|
|
|
|
|
4602
|
|
|
|
|
|
|
// Analyze can return non-unique lemma-tag pairs. |
|
4603
|
|
|
|
|
|
|
template |
|
4604
|
0
|
|
|
|
|
|
void morpho_prefix_guesser::analyze(string_piece form, vector& lemmas) { |
|
4605
|
0
|
0
|
|
|
|
|
if (!form.len) return; |
|
4606
|
|
|
|
|
|
|
|
|
4607
|
|
|
|
|
|
|
vector form_tmp; |
|
4608
|
|
|
|
|
|
|
vector middle_masks; |
|
4609
|
0
|
0
|
|
|
|
|
middle_masks.reserve(form.len); |
|
4610
|
|
|
|
|
|
|
|
|
4611
|
0
|
0
|
|
|
|
|
for (unsigned initial = 0; initial < form.len; initial++) { |
|
4612
|
|
|
|
|
|
|
// Match the initial prefix. |
|
4613
|
0
|
|
|
|
|
|
unsigned initial_mask = (1<
|
|
4614
|
0
|
0
|
|
|
|
|
if (initial) { |
|
4615
|
0
|
|
|
|
|
|
auto found = prefixes_initial.at_typed(form.str, initial); |
|
4616
|
0
|
0
|
|
|
|
|
if (!found) break; |
|
4617
|
0
|
|
|
|
|
|
initial_mask = unaligned_load(found); |
|
4618
|
|
|
|
|
|
|
} |
|
4619
|
|
|
|
|
|
|
|
|
4620
|
|
|
|
|
|
|
// If we have found an initial prefix (including the empty one), match middle prefixes. |
|
4621
|
0
|
0
|
|
|
|
|
if (initial_mask) { |
|
4622
|
0
|
0
|
|
|
|
|
middle_masks.resize(initial); |
|
4623
|
0
|
0
|
|
|
|
|
middle_masks.emplace_back(initial_mask); |
|
4624
|
0
|
0
|
|
|
|
|
for (unsigned middle = initial; middle < middle_masks.size(); middle++) { |
|
4625
|
0
|
0
|
|
|
|
|
if (!middle_masks[middle]) continue; |
|
4626
|
|
|
|
|
|
|
// Try matching middle prefixes from current index. |
|
4627
|
0
|
0
|
|
|
|
|
for (unsigned i = middle + 1; i < form.len; i++) { |
|
4628
|
0
|
|
|
|
|
|
auto found = prefixes_middle.at_typed(form.str + middle, i - middle); |
|
4629
|
0
|
0
|
|
|
|
|
if (!found) break; |
|
4630
|
0
|
0
|
|
|
|
|
if (unaligned_load(found)) { |
|
4631
|
0
|
0
|
|
|
|
|
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
|
|
|
0
|
|
|
|
|
|
|
4632
|
0
|
|
|
|
|
|
middle_masks[i] |= middle_masks[middle] & unaligned_load(found); |
|
4633
|
|
|
|
|
|
|
} |
|
4634
|
|
|
|
|
|
|
} |
|
4635
|
|
|
|
|
|
|
|
|
4636
|
|
|
|
|
|
|
// Try matching word forms if at least one middle prefix was found. |
|
4637
|
0
|
0
|
|
|
|
|
if (middle > initial && middle < form.len ) { |
|
|
|
0
|
|
|
|
|
|
|
4638
|
0
|
0
|
|
|
|
|
if (initial) { |
|
4639
|
0
|
0
|
|
|
|
|
if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len); |
|
4640
|
0
|
|
|
|
|
|
small_memcpy(form_tmp.data() + middle - initial, form.str, initial); |
|
4641
|
|
|
|
|
|
|
} |
|
4642
|
0
|
|
|
|
|
|
unsigned lemmas_ori_size = lemmas.size(); |
|
4643
|
0
|
0
|
|
|
|
|
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
|
|
|
0
|
|
|
|
|
|
|
4644
|
|
|
|
|
|
|
unsigned lemmas_new_size = lemmas_ori_size; |
|
4645
|
0
|
0
|
|
|
|
|
for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) { |
|
4646
|
0
|
0
|
|
|
|
|
for (unsigned filter = 0; filter < tag_filters.size(); filter++) |
|
4647
|
0
|
0
|
|
|
|
|
if ((middle_masks[middle] & (1<
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4648
|
0
|
0
|
|
|
|
|
if (i == lemmas_new_size) { |
|
4649
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].lemma.insert(0, form.str + initial, middle - initial); |
|
4650
|
|
|
|
|
|
|
} else { |
|
4651
|
0
|
0
|
|
|
|
|
lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial); |
|
4652
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].lemma.assign(form.str + initial, middle - initial); |
|
4653
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].lemma.append(lemmas[i].lemma); |
|
4654
|
0
|
|
|
|
|
|
lemmas[lemmas_new_size].tag = lemmas[i].tag; |
|
4655
|
|
|
|
|
|
|
} |
|
4656
|
0
|
|
|
|
|
|
lemmas_new_size++; |
|
4657
|
0
|
|
|
|
|
|
break; |
|
4658
|
|
|
|
|
|
|
} |
|
4659
|
|
|
|
|
|
|
} |
|
4660
|
0
|
0
|
|
|
|
|
if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end()); |
|
4661
|
|
|
|
|
|
|
} |
|
4662
|
|
|
|
|
|
|
} |
|
4663
|
|
|
|
|
|
|
} |
|
4664
|
|
|
|
|
|
|
} |
|
4665
|
|
|
|
|
|
|
} |
|
4666
|
|
|
|
|
|
|
|
|
4667
|
|
|
|
|
|
|
template |
|
4668
|
|
|
|
|
|
|
bool morpho_prefix_guesser::generate(string_piece /*lemma*/, const tag_filter& /*filter*/, vector& /*lemmas_forms*/) { |
|
4669
|
|
|
|
|
|
|
// Not implemented yet. Is it actually needed? |
|
4670
|
|
|
|
|
|
|
return false; |
|
4671
|
|
|
|
|
|
|
} |
|
4672
|
|
|
|
|
|
|
} // namespace morphodita |
|
4673
|
|
|
|
|
|
|
|
|
4674
|
|
|
|
|
|
|
///////// |
|
4675
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser.h |
|
4676
|
|
|
|
|
|
|
///////// |
|
4677
|
|
|
|
|
|
|
|
|
4678
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4679
|
|
|
|
|
|
|
// |
|
4680
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4681
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4682
|
|
|
|
|
|
|
// |
|
4683
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4684
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4685
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4686
|
|
|
|
|
|
|
|
|
4687
|
|
|
|
|
|
|
namespace morphodita { |
|
4688
|
|
|
|
|
|
|
|
|
4689
|
1
|
|
|
|
|
|
class morpho_statistical_guesser { |
|
4690
|
|
|
|
|
|
|
public: |
|
4691
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
4692
|
|
|
|
|
|
|
typedef vector used_rules; |
|
4693
|
|
|
|
|
|
|
void analyze(string_piece form, vector& lemmas, used_rules* used); |
|
4694
|
|
|
|
|
|
|
|
|
4695
|
|
|
|
|
|
|
private: |
|
4696
|
|
|
|
|
|
|
vector tags; |
|
4697
|
|
|
|
|
|
|
unsigned default_tag; |
|
4698
|
|
|
|
|
|
|
persistent_unordered_map rules; |
|
4699
|
|
|
|
|
|
|
}; |
|
4700
|
|
|
|
|
|
|
|
|
4701
|
|
|
|
|
|
|
} // namespace morphodita |
|
4702
|
|
|
|
|
|
|
|
|
4703
|
|
|
|
|
|
|
///////// |
|
4704
|
|
|
|
|
|
|
// File: morphodita/tokenizer/unicode_tokenizer.h |
|
4705
|
|
|
|
|
|
|
///////// |
|
4706
|
|
|
|
|
|
|
|
|
4707
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4708
|
|
|
|
|
|
|
// |
|
4709
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4710
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4711
|
|
|
|
|
|
|
// |
|
4712
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4713
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4714
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4715
|
|
|
|
|
|
|
|
|
4716
|
|
|
|
|
|
|
namespace morphodita { |
|
4717
|
|
|
|
|
|
|
|
|
4718
|
1
|
|
|
|
|
|
class unicode_tokenizer : public tokenizer { |
|
4719
|
|
|
|
|
|
|
public: |
|
4720
|
|
|
|
|
|
|
enum { URL_EMAIL_LATEST = 2 }; |
|
4721
|
|
|
|
|
|
|
unicode_tokenizer(unsigned url_email_tokenizer); |
|
4722
|
|
|
|
|
|
|
|
|
4723
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
4724
|
|
|
|
|
|
|
virtual bool next_sentence(vector* forms, vector* tokens) override; |
|
4725
|
|
|
|
|
|
|
|
|
4726
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) = 0; |
|
4727
|
|
|
|
|
|
|
|
|
4728
|
|
|
|
|
|
|
protected: |
|
4729
|
|
|
|
|
|
|
struct char_info { |
|
4730
|
|
|
|
|
|
|
char32_t chr; |
|
4731
|
|
|
|
|
|
|
unilib::unicode::category_t cat; |
|
4732
|
|
|
|
|
|
|
const char* str; |
|
4733
|
|
|
|
|
|
|
|
|
4734
|
36
|
|
|
|
|
|
char_info(char32_t chr, const char* str) : chr(chr), cat(unilib::unicode::category(chr)), str(str) {} |
|
4735
|
|
|
|
|
|
|
}; |
|
4736
|
|
|
|
|
|
|
vector chars; |
|
4737
|
|
|
|
|
|
|
size_t current; |
|
4738
|
|
|
|
|
|
|
|
|
4739
|
|
|
|
|
|
|
bool tokenize_url_email(vector& tokens); |
|
4740
|
|
|
|
|
|
|
bool emergency_sentence_split(const vector& tokens); |
|
4741
|
|
|
|
|
|
|
bool is_eos(const vector& tokens, char32_t eos_chr, const unordered_set* abbreviations); |
|
4742
|
|
|
|
|
|
|
|
|
4743
|
|
|
|
|
|
|
private: |
|
4744
|
|
|
|
|
|
|
unsigned url_email_tokenizer; |
|
4745
|
|
|
|
|
|
|
string text_buffer; |
|
4746
|
|
|
|
|
|
|
vector tokens_buffer; |
|
4747
|
|
|
|
|
|
|
string eos_buffer; |
|
4748
|
|
|
|
|
|
|
}; |
|
4749
|
|
|
|
|
|
|
|
|
4750
|
|
|
|
|
|
|
} // namespace morphodita |
|
4751
|
|
|
|
|
|
|
|
|
4752
|
|
|
|
|
|
|
///////// |
|
4753
|
|
|
|
|
|
|
// File: morphodita/tokenizer/ragel_tokenizer.h |
|
4754
|
|
|
|
|
|
|
///////// |
|
4755
|
|
|
|
|
|
|
|
|
4756
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4757
|
|
|
|
|
|
|
// |
|
4758
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4759
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4760
|
|
|
|
|
|
|
// |
|
4761
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4762
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4763
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4764
|
|
|
|
|
|
|
|
|
4765
|
|
|
|
|
|
|
namespace morphodita { |
|
4766
|
|
|
|
|
|
|
|
|
4767
|
0
|
|
|
|
|
|
class ragel_tokenizer : public unicode_tokenizer { |
|
4768
|
|
|
|
|
|
|
public: |
|
4769
|
|
|
|
|
|
|
ragel_tokenizer(unsigned url_email_tokenizer); |
|
4770
|
|
|
|
|
|
|
|
|
4771
|
|
|
|
|
|
|
protected: |
|
4772
|
|
|
|
|
|
|
static inline uint8_t ragel_char(const char_info& chr); |
|
4773
|
|
|
|
|
|
|
|
|
4774
|
|
|
|
|
|
|
private: |
|
4775
|
|
|
|
|
|
|
static void initialize_ragel_map(); |
|
4776
|
|
|
|
|
|
|
static vector ragel_map; |
|
4777
|
|
|
|
|
|
|
static atomic_flag ragel_map_flag; |
|
4778
|
|
|
|
|
|
|
static void ragel_map_add(char32_t chr, uint8_t mapping); |
|
4779
|
|
|
|
|
|
|
|
|
4780
|
|
|
|
|
|
|
friend class unicode_tokenizer; |
|
4781
|
|
|
|
|
|
|
static bool ragel_url_email(unsigned version, const vector& chars, size_t& current_char, vector& tokens); |
|
4782
|
|
|
|
|
|
|
}; |
|
4783
|
|
|
|
|
|
|
|
|
4784
|
|
|
|
|
|
|
uint8_t ragel_tokenizer::ragel_char(const char_info& chr) { |
|
4785
|
30
|
50
|
|
|
|
|
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4786
|
|
|
|
|
|
|
} |
|
4787
|
|
|
|
|
|
|
|
|
4788
|
|
|
|
|
|
|
} // namespace morphodita |
|
4789
|
|
|
|
|
|
|
|
|
4790
|
|
|
|
|
|
|
///////// |
|
4791
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer.h |
|
4792
|
|
|
|
|
|
|
///////// |
|
4793
|
|
|
|
|
|
|
|
|
4794
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4795
|
|
|
|
|
|
|
// |
|
4796
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4797
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4798
|
|
|
|
|
|
|
// |
|
4799
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4800
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4801
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4802
|
|
|
|
|
|
|
|
|
4803
|
|
|
|
|
|
|
namespace morphodita { |
|
4804
|
|
|
|
|
|
|
|
|
4805
|
0
|
|
|
|
|
|
class czech_tokenizer : public ragel_tokenizer { |
|
4806
|
|
|
|
|
|
|
public: |
|
4807
|
|
|
|
|
|
|
enum tokenizer_language { CZECH = 0, SLOVAK = 1 }; |
|
4808
|
|
|
|
|
|
|
enum { LATEST = 2 }; |
|
4809
|
|
|
|
|
|
|
czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m = nullptr); |
|
4810
|
|
|
|
|
|
|
|
|
4811
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
|
4812
|
|
|
|
|
|
|
|
|
4813
|
|
|
|
|
|
|
private: |
|
4814
|
|
|
|
|
|
|
const morpho* m; |
|
4815
|
|
|
|
|
|
|
const unordered_set* abbreviations; |
|
4816
|
|
|
|
|
|
|
vector lemmas; |
|
4817
|
|
|
|
|
|
|
|
|
4818
|
|
|
|
|
|
|
void merge_hyphenated(vector& tokens); |
|
4819
|
|
|
|
|
|
|
|
|
4820
|
|
|
|
|
|
|
static const unordered_set abbreviations_czech; |
|
4821
|
|
|
|
|
|
|
static const unordered_set abbreviations_slovak; |
|
4822
|
|
|
|
|
|
|
}; |
|
4823
|
|
|
|
|
|
|
|
|
4824
|
|
|
|
|
|
|
} // namespace morphodita |
|
4825
|
|
|
|
|
|
|
|
|
4826
|
|
|
|
|
|
|
///////// |
|
4827
|
|
|
|
|
|
|
// File: morphodita/morpho/czech_morpho.h |
|
4828
|
|
|
|
|
|
|
///////// |
|
4829
|
|
|
|
|
|
|
|
|
4830
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4831
|
|
|
|
|
|
|
// |
|
4832
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4833
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4834
|
|
|
|
|
|
|
// |
|
4835
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4836
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4837
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4838
|
|
|
|
|
|
|
|
|
4839
|
|
|
|
|
|
|
namespace morphodita { |
|
4840
|
|
|
|
|
|
|
|
|
4841
|
0
|
|
|
|
|
|
class czech_morpho : public morpho { |
|
4842
|
|
|
|
|
|
|
public: |
|
4843
|
|
|
|
|
|
|
using morpho_language = czech_tokenizer::tokenizer_language; |
|
4844
|
|
|
|
|
|
|
|
|
4845
|
0
|
0
|
|
|
|
|
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4846
|
|
|
|
|
|
|
|
|
4847
|
|
|
|
|
|
|
virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override; |
|
4848
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override; |
|
4849
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const override; |
|
4850
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const override; |
|
4851
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const override; |
|
4852
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const override; |
|
4853
|
|
|
|
|
|
|
|
|
4854
|
|
|
|
|
|
|
bool load(istream& is); |
|
4855
|
|
|
|
|
|
|
private: |
|
4856
|
|
|
|
|
|
|
inline void analyze_special(string_piece form, vector& lemmas) const; |
|
4857
|
|
|
|
|
|
|
|
|
4858
|
|
|
|
|
|
|
morpho_language language; |
|
4859
|
|
|
|
|
|
|
unsigned version; |
|
4860
|
|
|
|
|
|
|
morpho_dictionary dictionary; |
|
4861
|
|
|
|
|
|
|
unique_ptr> prefix_guesser; |
|
4862
|
|
|
|
|
|
|
unique_ptr statistical_guesser; |
|
4863
|
|
|
|
|
|
|
|
|
4864
|
|
|
|
|
|
|
string unknown_tag = "X@-------------"; |
|
4865
|
|
|
|
|
|
|
string number_tag = "C=-------------"; |
|
4866
|
|
|
|
|
|
|
string punctuation_tag = "Z:-------------"; |
|
4867
|
|
|
|
|
|
|
}; |
|
4868
|
|
|
|
|
|
|
|
|
4869
|
|
|
|
|
|
|
} // namespace morphodita |
|
4870
|
|
|
|
|
|
|
|
|
4871
|
|
|
|
|
|
|
///////// |
|
4872
|
|
|
|
|
|
|
// File: morphodita/morpho/czech_morpho.cpp |
|
4873
|
|
|
|
|
|
|
///////// |
|
4874
|
|
|
|
|
|
|
|
|
4875
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
4876
|
|
|
|
|
|
|
// |
|
4877
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
4878
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
4879
|
|
|
|
|
|
|
// |
|
4880
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
4881
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
4882
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4883
|
|
|
|
|
|
|
|
|
4884
|
|
|
|
|
|
|
namespace morphodita { |
|
4885
|
|
|
|
|
|
|
|
|
4886
|
0
|
|
|
|
|
|
bool czech_morpho::load(istream& is) { |
|
4887
|
|
|
|
|
|
|
binary_decoder data; |
|
4888
|
0
|
0
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
|
0
|
|
|
|
|
|
|
4889
|
|
|
|
|
|
|
|
|
4890
|
|
|
|
|
|
|
try { |
|
4891
|
|
|
|
|
|
|
// Load tag length |
|
4892
|
0
|
0
|
|
|
|
|
unsigned tag_length = data.next_1B(); |
|
4893
|
0
|
0
|
|
|
|
|
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
|
|
|
0
|
|
|
|
|
|
|
4894
|
0
|
0
|
|
|
|
|
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
|
|
|
0
|
|
|
|
|
|
|
4895
|
0
|
0
|
|
|
|
|
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
|
|
|
0
|
|
|
|
|
|
|
4896
|
|
|
|
|
|
|
|
|
4897
|
|
|
|
|
|
|
// Load dictionary |
|
4898
|
0
|
0
|
|
|
|
|
dictionary.load(data); |
|
4899
|
|
|
|
|
|
|
|
|
4900
|
|
|
|
|
|
|
// Optionally prefix guesser if present |
|
4901
|
0
|
|
|
|
|
|
prefix_guesser.reset(); |
|
4902
|
0
|
0
|
|
|
|
|
if (data.next_1B()) { |
|
|
|
0
|
|
|
|
|
|
|
4903
|
0
|
0
|
|
|
|
|
prefix_guesser.reset(new morpho_prefix_guesser(dictionary)); |
|
4904
|
0
|
0
|
|
|
|
|
prefix_guesser->load(data); |
|
4905
|
|
|
|
|
|
|
} |
|
4906
|
|
|
|
|
|
|
|
|
4907
|
|
|
|
|
|
|
// Optionally statistical guesser if present |
|
4908
|
|
|
|
|
|
|
statistical_guesser.reset(); |
|
4909
|
0
|
0
|
|
|
|
|
if (data.next_1B()) { |
|
|
|
0
|
|
|
|
|
|
|
4910
|
0
|
0
|
|
|
|
|
statistical_guesser.reset(new morpho_statistical_guesser()); |
|
4911
|
0
|
0
|
|
|
|
|
statistical_guesser->load(data); |
|
4912
|
|
0
|
|
|
|
|
} |
|
4913
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
|
4914
|
|
|
|
|
|
|
return false; |
|
4915
|
|
|
|
|
|
|
} |
|
4916
|
|
|
|
|
|
|
|
|
4917
|
0
|
|
|
|
|
|
return data.is_end(); |
|
4918
|
|
|
|
|
|
|
} |
|
4919
|
|
|
|
|
|
|
|
|
4920
|
0
|
|
|
|
|
|
int czech_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const { |
|
4921
|
|
|
|
|
|
|
lemmas.clear(); |
|
4922
|
|
|
|
|
|
|
|
|
4923
|
0
|
0
|
|
|
|
|
if (form.len) { |
|
4924
|
|
|
|
|
|
|
// Generate all casing variants if needed (they are different than given form). |
|
4925
|
|
|
|
|
|
|
string form_uclc; // first uppercase, rest lowercase |
|
4926
|
|
|
|
|
|
|
string form_lc; // all lowercase |
|
4927
|
0
|
0
|
|
|
|
|
generate_casing_variants(form, form_uclc, form_lc); |
|
4928
|
|
|
|
|
|
|
|
|
4929
|
|
|
|
|
|
|
// Start by analysing using the dictionary and all casing variants. |
|
4930
|
0
|
0
|
|
|
|
|
dictionary.analyze(form, lemmas); |
|
4931
|
0
|
0
|
|
|
|
|
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
|
|
0
|
|
|
|
|
|
|
4932
|
0
|
0
|
|
|
|
|
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
|
|
0
|
|
|
|
|
|
|
4933
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
|
4934
|
|
|
|
|
|
|
|
|
4935
|
|
|
|
|
|
|
// Then call analyze_special to handle numbers and punctuation. |
|
4936
|
0
|
0
|
|
|
|
|
analyze_special(form, lemmas); |
|
4937
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
|
4938
|
|
|
|
|
|
|
|
|
4939
|
|
|
|
|
|
|
// For the prefix guesser, use only form_lc. |
|
4940
|
0
|
0
|
|
|
|
|
if (guesser == GUESSER && prefix_guesser) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4941
|
0
|
0
|
|
|
|
|
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
|
|
|
0
|
|
|
|
|
|
|
4942
|
|
|
|
|
|
|
bool prefix_guesser_guesses = !lemmas.empty(); |
|
4943
|
|
|
|
|
|
|
|
|
4944
|
|
|
|
|
|
|
// For the statistical guesser, use all casing variants. |
|
4945
|
0
|
0
|
|
|
|
|
if (guesser == GUESSER && statistical_guesser) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4946
|
0
|
0
|
|
|
|
|
if (form_uclc.empty() && form_lc.empty()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4947
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, nullptr); |
|
4948
|
|
|
|
|
|
|
else { |
|
4949
|
0
|
0
|
|
|
|
|
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
|
4950
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, &used_rules); |
|
4951
|
0
|
0
|
|
|
|
|
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
|
|
0
|
|
|
|
|
|
|
4952
|
0
|
0
|
|
|
|
|
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
|
|
0
|
|
|
|
|
|
|
4953
|
|
|
|
|
|
|
} |
|
4954
|
|
|
|
|
|
|
} |
|
4955
|
|
|
|
|
|
|
|
|
4956
|
|
|
|
|
|
|
// Make sure results are unique lemma-tag pairs. Statistical guesser produces |
|
4957
|
|
|
|
|
|
|
// unique lemma-tag pairs, but prefix guesser does not. |
|
4958
|
0
|
0
|
|
|
|
|
if (prefix_guesser_guesses) { |
|
4959
|
0
|
|
|
|
|
|
sort(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) { |
|
4960
|
0
|
|
|
|
|
|
int lemma_compare = a.lemma.compare(b.lemma); |
|
4961
|
0
|
0
|
|
|
|
|
return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); |
|
4962
|
|
|
|
|
|
|
}); |
|
4963
|
0
|
|
|
|
|
|
auto lemmas_end = unique(lemmas.begin(), lemmas.end(), [](const tagged_lemma& a, const tagged_lemma& b) { |
|
4964
|
0
|
0
|
|
|
|
|
return a.lemma == b.lemma && a.tag == b.tag; |
|
|
|
0
|
|
|
|
|
|
|
4965
|
0
|
|
|
|
|
|
}); |
|
4966
|
0
|
0
|
|
|
|
|
if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end()); |
|
4967
|
|
|
|
|
|
|
} |
|
4968
|
|
|
|
|
|
|
|
|
4969
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return GUESSER; |
|
4970
|
|
|
|
|
|
|
} |
|
4971
|
|
|
|
|
|
|
|
|
4972
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
|
4973
|
0
|
|
|
|
|
|
return -1; |
|
4974
|
|
|
|
|
|
|
} |
|
4975
|
|
|
|
|
|
|
|
|
4976
|
0
|
|
|
|
|
|
int czech_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode guesser, vector& forms) const { |
|
4977
|
|
|
|
|
|
|
forms.clear(); |
|
4978
|
|
|
|
|
|
|
|
|
4979
|
0
|
|
|
|
|
|
tag_filter filter(tag_wildcard); |
|
4980
|
|
|
|
|
|
|
|
|
4981
|
0
|
0
|
|
|
|
|
if (lemma.len) { |
|
4982
|
0
|
0
|
|
|
|
|
if (dictionary.generate(lemma, filter, forms)) |
|
|
|
0
|
|
|
|
|
|
|
4983
|
|
|
|
|
|
|
return NO_GUESSER; |
|
4984
|
|
|
|
|
|
|
|
|
4985
|
0
|
0
|
|
|
|
|
if (guesser == GUESSER && prefix_guesser) |
|
|
|
0
|
|
|
|
|
|
|
4986
|
|
|
|
|
|
|
if (prefix_guesser->generate(lemma, filter, forms)) |
|
4987
|
|
|
|
|
|
|
return GUESSER; |
|
4988
|
|
|
|
|
|
|
} |
|
4989
|
|
|
|
|
|
|
|
|
4990
|
|
|
|
|
|
|
return -1; |
|
4991
|
|
|
|
|
|
|
} |
|
4992
|
|
|
|
|
|
|
|
|
4993
|
0
|
|
|
|
|
|
int czech_morpho::raw_lemma_len(string_piece lemma) const { |
|
4994
|
0
|
|
|
|
|
|
return czech_lemma_addinfo::raw_lemma_len(lemma); |
|
4995
|
|
|
|
|
|
|
} |
|
4996
|
|
|
|
|
|
|
|
|
4997
|
0
|
|
|
|
|
|
int czech_morpho::lemma_id_len(string_piece lemma) const { |
|
4998
|
0
|
|
|
|
|
|
return czech_lemma_addinfo::lemma_id_len(lemma); |
|
4999
|
|
|
|
|
|
|
} |
|
5000
|
|
|
|
|
|
|
|
|
5001
|
0
|
|
|
|
|
|
int czech_morpho::raw_form_len(string_piece form) const { |
|
5002
|
0
|
|
|
|
|
|
return form.len; |
|
5003
|
|
|
|
|
|
|
} |
|
5004
|
|
|
|
|
|
|
|
|
5005
|
0
|
|
|
|
|
|
tokenizer* czech_morpho::new_tokenizer() const { |
|
5006
|
0
|
0
|
|
|
|
|
return new czech_tokenizer(language, version, this); |
|
5007
|
|
|
|
|
|
|
} |
|
5008
|
|
|
|
|
|
|
|
|
5009
|
|
|
|
|
|
|
// What characters are considered punctuation except for the ones in unicode Punctuation category. |
|
5010
|
|
|
|
|
|
|
static bool punctuation_additional[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*$*/, |
|
5011
|
|
|
|
|
|
|
0,0,0,0,0,0,1/*+*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*<*/,1/*=*/,1/*>*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5012
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,1/*^*/,0,1/*`*/,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*|*/,0,1/*~*/,0,0,0,0,0,0,0,0, |
|
5013
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5014
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5015
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5016
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5017
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5018
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5019
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5020
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5021
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5022
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1/*caron*/}; |
|
5023
|
|
|
|
|
|
|
|
|
5024
|
|
|
|
|
|
|
// What characters of unicode Punctuation category are not considered punctuation. |
|
5025
|
|
|
|
|
|
|
static bool punctuation_exceptions[] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5026
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5027
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
5028
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,1/*paragraph*/}; |
|
5029
|
|
|
|
|
|
|
|
|
5030
|
0
|
|
|
|
|
|
void czech_morpho::analyze_special(string_piece form, vector& lemmas) const { |
|
5031
|
|
|
|
|
|
|
using namespace unilib; |
|
5032
|
|
|
|
|
|
|
|
|
5033
|
|
|
|
|
|
|
// Analyzer for numbers and punctuation. |
|
5034
|
|
|
|
|
|
|
// Number is anything matching [+-]? is_Pn* ([.,] is_Pn*)? ([Ee] [+-]? is_Pn+)? for at least one is_Pn* nonempty. |
|
5035
|
|
|
|
|
|
|
// Punctuation is any form beginning with either unicode punctuation or punctuation_exceptions character. |
|
5036
|
|
|
|
|
|
|
// Beware that numbers takes precedence, so - is punctuation, -3 is number, -. is punctuation, -.3 is number. |
|
5037
|
0
|
0
|
|
|
|
|
if (!form.len) return; |
|
5038
|
|
|
|
|
|
|
|
|
5039
|
0
|
|
|
|
|
|
string_piece form_ori = form; |
|
5040
|
0
|
|
|
|
|
|
char32_t first = utf8::decode(form.str, form.len); |
|
5041
|
|
|
|
|
|
|
|
|
5042
|
|
|
|
|
|
|
// Try matching a number. |
|
5043
|
|
|
|
|
|
|
char32_t codepoint = first; |
|
5044
|
|
|
|
|
|
|
bool any_digit = false; |
|
5045
|
0
|
0
|
|
|
|
|
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
|
5046
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
|
5047
|
0
|
0
|
|
|
|
|
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5048
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
|
5049
|
0
|
0
|
|
|
|
|
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
|
|
0
|
|
|
|
|
|
|
5050
|
0
|
|
|
|
|
|
codepoint = utf8::decode(form.str, form.len); |
|
5051
|
0
|
0
|
|
|
|
|
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
|
5052
|
|
|
|
|
|
|
any_digit = false; |
|
5053
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
|
5054
|
|
|
|
|
|
|
} |
|
5055
|
|
|
|
|
|
|
|
|
5056
|
0
|
0
|
|
|
|
|
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5057
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form_ori.str, form_ori.len), number_tag); |
|
5058
|
0
|
0
|
|
|
|
|
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5059
|
0
|
0
|
|
|
|
|
((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first]))) |
|
|
|
0
|
|
|
|
|
|
|
5060
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
|
5061
|
|
|
|
|
|
|
} |
|
5062
|
|
|
|
|
|
|
|
|
5063
|
|
|
|
|
|
|
} // namespace morphodita |
|
5064
|
|
|
|
|
|
|
|
|
5065
|
|
|
|
|
|
|
///////// |
|
5066
|
|
|
|
|
|
|
// File: morphodita/morpho/english_lemma_addinfo.h |
|
5067
|
|
|
|
|
|
|
///////// |
|
5068
|
|
|
|
|
|
|
|
|
5069
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
5070
|
|
|
|
|
|
|
// |
|
5071
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
5072
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
5073
|
|
|
|
|
|
|
// |
|
5074
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
5075
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5076
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5077
|
|
|
|
|
|
|
|
|
5078
|
|
|
|
|
|
|
namespace morphodita { |
|
5079
|
|
|
|
|
|
|
|
|
5080
|
|
|
|
|
|
|
// Declarations |
|
5081
|
0
|
|
|
|
|
|
struct english_lemma_addinfo { |
|
5082
|
|
|
|
|
|
|
inline static int raw_lemma_len(string_piece lemma); |
|
5083
|
|
|
|
|
|
|
inline static int lemma_id_len(string_piece lemma); |
|
5084
|
|
|
|
|
|
|
inline static string format(const unsigned char* addinfo, int addinfo_len); |
|
5085
|
|
|
|
|
|
|
inline static bool generatable(const unsigned char* addinfo, int addinfo_len); |
|
5086
|
|
|
|
|
|
|
|
|
5087
|
|
|
|
|
|
|
inline int parse(string_piece lemma, bool die_on_failure = false); |
|
5088
|
|
|
|
|
|
|
inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len); |
|
5089
|
|
|
|
|
|
|
|
|
5090
|
|
|
|
|
|
|
vector data; |
|
5091
|
|
|
|
|
|
|
}; |
|
5092
|
|
|
|
|
|
|
|
|
5093
|
|
|
|
|
|
|
// Definitions |
|
5094
|
0
|
|
|
|
|
|
int english_lemma_addinfo::raw_lemma_len(string_piece lemma) { |
|
5095
|
|
|
|
|
|
|
// Lemma ends either by |
|
5096
|
|
|
|
|
|
|
// - '^' on non-first position followed by nothing or [A-Za-z][-A-Za-z]* |
|
5097
|
|
|
|
|
|
|
// - '+' on non-first position followed by nothing |
|
5098
|
0
|
0
|
|
|
|
|
for (unsigned len = 1; len < lemma.len; len++) { |
|
5099
|
0
|
0
|
|
|
|
|
if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+')) |
|
|
|
0
|
|
|
|
|
|
|
5100
|
0
|
|
|
|
|
|
return len; |
|
5101
|
0
|
0
|
|
|
|
|
if (len + 1 < lemma.len && lemma.str[len] == '^') { |
|
|
|
0
|
|
|
|
|
|
|
5102
|
|
|
|
|
|
|
bool ok = true; |
|
5103
|
0
|
0
|
|
|
|
|
for (unsigned i = len + 1; ok && i < lemma.len; i++) |
|
|
|
0
|
|
|
|
|
|
|
5104
|
0
|
0
|
|
|
|
|
ok &= (lemma.str[i] >= 'A' && lemma.str[i] <= 'Z') || |
|
5105
|
0
|
0
|
|
|
|
|
(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') || |
|
|
|
0
|
|
|
|
|
|
|
5106
|
0
|
0
|
|
|
|
|
(i > len + 1 && lemma.str[i] == '-'); |
|
5107
|
0
|
0
|
|
|
|
|
if (ok) return len; |
|
5108
|
|
|
|
|
|
|
} |
|
5109
|
|
|
|
|
|
|
} |
|
5110
|
0
|
|
|
|
|
|
return lemma.len; |
|
5111
|
|
|
|
|
|
|
} |
|
5112
|
|
|
|
|
|
|
|
|
5113
|
|
|
|
|
|
|
int english_lemma_addinfo::lemma_id_len(string_piece lemma) { |
|
5114
|
|
|
|
|
|
|
// No lemma comments. |
|
5115
|
0
|
|
|
|
|
|
return lemma.len; |
|
5116
|
|
|
|
|
|
|
} |
|
5117
|
|
|
|
|
|
|
|
|
5118
|
|
|
|
|
|
|
string english_lemma_addinfo::format(const unsigned char* addinfo, int addinfo_len) { |
|
5119
|
0
|
|
|
|
|
|
return string((const char*) addinfo, addinfo_len); |
|
5120
|
|
|
|
|
|
|
} |
|
5121
|
|
|
|
|
|
|
|
|
5122
|
|
|
|
|
|
|
bool english_lemma_addinfo::generatable(const unsigned char* /*addinfo*/, int /*addinfo_len*/) { |
|
5123
|
|
|
|
|
|
|
return true; |
|
5124
|
|
|
|
|
|
|
} |
|
5125
|
|
|
|
|
|
|
|
|
5126
|
0
|
|
|
|
|
|
int english_lemma_addinfo::parse(string_piece lemma, bool /*die_on_failure*/) { |
|
5127
|
|
|
|
|
|
|
data.clear(); |
|
5128
|
|
|
|
|
|
|
|
|
5129
|
0
|
|
|
|
|
|
size_t len = raw_lemma_len(lemma); |
|
5130
|
0
|
0
|
|
|
|
|
for (size_t i = len; i < lemma.len; i++) |
|
5131
|
0
|
|
|
|
|
|
data.push_back(lemma.str[i]); |
|
5132
|
|
|
|
|
|
|
|
|
5133
|
0
|
|
|
|
|
|
return len; |
|
5134
|
|
|
|
|
|
|
} |
|
5135
|
|
|
|
|
|
|
|
|
5136
|
0
|
|
|
|
|
|
bool english_lemma_addinfo::match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len) { |
|
5137
|
0
|
0
|
|
|
|
|
if (data.empty()) return true; |
|
5138
|
0
|
0
|
|
|
|
|
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5139
|
0
|
0
|
|
|
|
|
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5140
|
0
|
0
|
|
|
|
|
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
|
|
|
0
|
|
|
|
|
|
|
5141
|
|
|
|
|
|
|
} |
|
5142
|
|
|
|
|
|
|
|
|
5143
|
|
|
|
|
|
|
} // namespace morphodita |
|
5144
|
|
|
|
|
|
|
|
|
5145
|
|
|
|
|
|
|
///////// |
|
5146
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho_guesser.h |
|
5147
|
|
|
|
|
|
|
///////// |
|
5148
|
|
|
|
|
|
|
|
|
5149
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
5150
|
|
|
|
|
|
|
// |
|
5151
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
5152
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
5153
|
|
|
|
|
|
|
// |
|
5154
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
5155
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5156
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5157
|
|
|
|
|
|
|
|
|
5158
|
|
|
|
|
|
|
namespace morphodita { |
|
5159
|
|
|
|
|
|
|
|
|
5160
|
0
|
0
|
|
|
|
|
class english_morpho_guesser { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5161
|
|
|
|
|
|
|
public: |
|
5162
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
5163
|
|
|
|
|
|
|
void analyze(string_piece form, string_piece form_lc, vector& lemmas) const; |
|
5164
|
|
|
|
|
|
|
bool analyze_proper_names(string_piece form, string_piece form_lc, vector& lemmas) const; |
|
5165
|
|
|
|
|
|
|
|
|
5166
|
|
|
|
|
|
|
private: |
|
5167
|
|
|
|
|
|
|
inline void add(const string& tag, const string& form, vector& lemmas) const; |
|
5168
|
|
|
|
|
|
|
inline void add(const string& tag, const string& tag2, const string& form, vector& lemmas) const; |
|
5169
|
|
|
|
|
|
|
inline void add(const string& tag, const string& form, unsigned negation_len, vector& lemmas) const; |
|
5170
|
|
|
|
|
|
|
inline void add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector& lemmas) const; |
|
5171
|
|
|
|
|
|
|
void add_NNS(const string& form, unsigned negation_len, vector& lemmas) const; |
|
5172
|
|
|
|
|
|
|
void add_NNPS(const string& form, vector& lemmas) const; |
|
5173
|
|
|
|
|
|
|
void add_VBG(const string& form, vector& lemmas) const; |
|
5174
|
|
|
|
|
|
|
void add_VBD_VBN(const string& form, vector& lemmas) const; |
|
5175
|
|
|
|
|
|
|
void add_VBZ(const string& form, vector& lemmas) const; |
|
5176
|
|
|
|
|
|
|
void add_JJR_RBR(const string& form, unsigned negation_len, vector& lemmas) const; |
|
5177
|
|
|
|
|
|
|
void add_JJS_RBS(const string& form, unsigned negation_len, vector& lemmas) const; |
|
5178
|
|
|
|
|
|
|
|
|
5179
|
|
|
|
|
|
|
enum { NEGATION_LEN = 0, TO_FOLLOW = 1, TOTAL = 2 }; |
|
5180
|
|
|
|
|
|
|
vector exceptions_tags; |
|
5181
|
|
|
|
|
|
|
persistent_unordered_map exceptions; |
|
5182
|
|
|
|
|
|
|
persistent_unordered_map negations; |
|
5183
|
|
|
|
|
|
|
string CD = "CD", FW = "FW", JJ = "JJ", JJR = "JJR", JJS = "JJS", |
|
5184
|
|
|
|
|
|
|
NN = "NN", NNP = "NNP", NNPS = "NNPS", NNS = "NNS", RB = "RB", |
|
5185
|
|
|
|
|
|
|
RBR = "RBR", RBS = "RBS", SYM = "SYM", VB = "VB", VBD = "VBD", |
|
5186
|
|
|
|
|
|
|
VBG = "VBG", VBN = "VBN", VBP = "VBP", VBZ = "VBZ"; |
|
5187
|
|
|
|
|
|
|
}; |
|
5188
|
|
|
|
|
|
|
|
|
5189
|
|
|
|
|
|
|
} // namespace morphodita |
|
5190
|
|
|
|
|
|
|
|
|
5191
|
|
|
|
|
|
|
///////// |
|
5192
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho.h |
|
5193
|
|
|
|
|
|
|
///////// |
|
5194
|
|
|
|
|
|
|
|
|
5195
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
5196
|
|
|
|
|
|
|
// |
|
5197
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
5198
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
5199
|
|
|
|
|
|
|
// |
|
5200
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
5201
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5202
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5203
|
|
|
|
|
|
|
|
|
5204
|
|
|
|
|
|
|
namespace morphodita { |
|
5205
|
|
|
|
|
|
|
|
|
5206
|
0
|
|
|
|
|
|
class english_morpho : public morpho { |
|
5207
|
|
|
|
|
|
|
public: |
|
5208
|
0
|
0
|
|
|
|
|
english_morpho(unsigned version) : version(version) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5209
|
|
|
|
|
|
|
|
|
5210
|
|
|
|
|
|
|
virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override; |
|
5211
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override; |
|
5212
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const override; |
|
5213
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const override; |
|
5214
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const override; |
|
5215
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const override; |
|
5216
|
|
|
|
|
|
|
|
|
5217
|
|
|
|
|
|
|
bool load(istream& is); |
|
5218
|
|
|
|
|
|
|
private: |
|
5219
|
|
|
|
|
|
|
inline void analyze_special(string_piece form, vector& lemmas) const; |
|
5220
|
|
|
|
|
|
|
|
|
5221
|
|
|
|
|
|
|
unsigned version; |
|
5222
|
|
|
|
|
|
|
morpho_dictionary dictionary; |
|
5223
|
|
|
|
|
|
|
english_morpho_guesser morpho_guesser; |
|
5224
|
|
|
|
|
|
|
|
|
5225
|
|
|
|
|
|
|
string unknown_tag = "UNK"; |
|
5226
|
|
|
|
|
|
|
string number_tag = "CD", nnp_tag = "NNP", ls_tag = "LS"; |
|
5227
|
|
|
|
|
|
|
string open_quotation_tag = "``", close_quotation_tag = "''"; |
|
5228
|
|
|
|
|
|
|
string open_parenthesis_tag = "(", close_parenthesis_tag = ")"; |
|
5229
|
|
|
|
|
|
|
string comma_tag = ",", dot_tag = ".", punctuation_tag = ":", hash_tag = "#", dollar_tag = "$"; |
|
5230
|
|
|
|
|
|
|
string sym_tag = "SYM", jj_tag = "JJ", nn_tag = "NN", nns_tag = "NNS", cc_tag = "CC", pos_tag = "POS", in_tag = "IN"; |
|
5231
|
|
|
|
|
|
|
}; |
|
5232
|
|
|
|
|
|
|
|
|
5233
|
|
|
|
|
|
|
} // namespace morphodita |
|
5234
|
|
|
|
|
|
|
|
|
5235
|
|
|
|
|
|
|
///////// |
|
5236
|
|
|
|
|
|
|
// File: morphodita/tokenizer/english_tokenizer.h |
|
5237
|
|
|
|
|
|
|
///////// |
|
5238
|
|
|
|
|
|
|
|
|
5239
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
5240
|
|
|
|
|
|
|
// |
|
5241
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
5242
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
5243
|
|
|
|
|
|
|
// |
|
5244
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
5245
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5246
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5247
|
|
|
|
|
|
|
|
|
5248
|
|
|
|
|
|
|
namespace morphodita { |
|
5249
|
|
|
|
|
|
|
|
|
5250
|
0
|
|
|
|
|
|
class english_tokenizer : public ragel_tokenizer { |
|
5251
|
|
|
|
|
|
|
public: |
|
5252
|
|
|
|
|
|
|
enum { LATEST = 2 }; |
|
5253
|
|
|
|
|
|
|
english_tokenizer(unsigned version); |
|
5254
|
|
|
|
|
|
|
|
|
5255
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
|
5256
|
|
|
|
|
|
|
|
|
5257
|
|
|
|
|
|
|
private: |
|
5258
|
|
|
|
|
|
|
void split_token(vector& tokens); |
|
5259
|
|
|
|
|
|
|
|
|
5260
|
|
|
|
|
|
|
static const unordered_set abbreviations; |
|
5261
|
|
|
|
|
|
|
}; |
|
5262
|
|
|
|
|
|
|
|
|
5263
|
|
|
|
|
|
|
} // namespace morphodita |
|
5264
|
|
|
|
|
|
|
|
|
5265
|
|
|
|
|
|
|
///////// |
|
5266
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho.cpp |
|
5267
|
|
|
|
|
|
|
///////// |
|
5268
|
|
|
|
|
|
|
|
|
5269
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
5270
|
|
|
|
|
|
|
// |
|
5271
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
5272
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
5273
|
|
|
|
|
|
|
// |
|
5274
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
5275
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5276
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5277
|
|
|
|
|
|
|
|
|
5278
|
|
|
|
|
|
|
namespace morphodita { |
|
5279
|
|
|
|
|
|
|
|
|
5280
|
0
|
|
|
|
|
|
bool english_morpho::load(istream& is) { |
|
5281
|
|
|
|
|
|
|
binary_decoder data; |
|
5282
|
0
|
0
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
|
0
|
|
|
|
|
|
|
5283
|
|
|
|
|
|
|
|
|
5284
|
|
|
|
|
|
|
try { |
|
5285
|
0
|
0
|
|
|
|
|
dictionary.load(data); |
|
5286
|
0
|
0
|
|
|
|
|
morpho_guesser.load(data); |
|
|
|
0
|
|
|
|
|
|
|
5287
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
|
5288
|
|
|
|
|
|
|
return false; |
|
5289
|
|
|
|
|
|
|
} |
|
5290
|
|
|
|
|
|
|
|
|
5291
|
0
|
|
|
|
|
|
return data.is_end(); |
|
5292
|
|
|
|
|
|
|
} |
|
5293
|
|
|
|
|
|
|
|
|
5294
|
0
|
|
|
|
|
|
// Analyzes one word form and stores the resulting analyses in `lemmas`
// (which is cleared first).
//
// For a non-empty form the following sources are tried in order, stopping at
// the first one which produces any analysis:
//   1. the dictionary, queried with the form and with its casing variants;
//      when `guesser` is not NO_GUESSER, proper-name analyses may be added
//      by morpho_guesser.analyze_proper_names,
//   2. analyze_special (numbers, punctuation, symbols),
//   3. morpho_guesser.analyze, only when `guesser == GUESSER`.
//
// Returns NO_GUESSER or GUESSER according to which source contributed, or -1
// when nothing matched; in that case a single analysis consisting of the
// form itself and unknown_tag is stored.
int english_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const {
  lemmas.clear();

  if (form.len) {
    // Casing variants are left empty when they are not needed
    // (i.e., they would not differ from the given form).
    string first_upper_rest_lower;
    string all_lower;
    generate_casing_variants(form, first_upper_rest_lower, all_lower);

    // Dictionary lookup of the form and of every generated variant.
    dictionary.analyze(form, lemmas);
    if (!first_upper_rest_lower.empty()) dictionary.analyze(first_upper_rest_lower, lemmas);
    if (!all_lower.empty()) dictionary.analyze(all_lower, lemmas);

    if (!lemmas.empty()) {
      if (guesser == NO_GUESSER)
        return NO_GUESSER;
      // The guesser is reported only if it really added a proper-name analysis.
      const bool proper_name_added =
          morpho_guesser.analyze_proper_names(form, all_lower.empty() ? form : all_lower, lemmas);
      return proper_name_added ? GUESSER : NO_GUESSER;
    }

    // Numbers, punctuation and symbols.
    analyze_special(form, lemmas);
    if (!lemmas.empty())
      return NO_GUESSER;

    // Rule-based guesser working on the lowercased form, if allowed.
    if (guesser == GUESSER)
      morpho_guesser.analyze(form, all_lower.empty() ? form : all_lower, lemmas);
    if (!lemmas.empty())
      return GUESSER;
  }

  // Nothing matched (or the form is empty): fall back to the unknown tag.
  lemmas.emplace_back(string(form.str, form.len), unknown_tag);
  return -1;
}
|
5323
|
|
|
|
|
|
|
|
|
5324
|
0
|
|
|
|
|
|
// Generates the forms of `lemma` whose tags pass the `tag_wildcard` filter
// and stores them in `forms` (which is cleared first).
//
// Only the dictionary is consulted; the guesser mode argument is ignored.
// Returns NO_GUESSER when the lemma is non-empty and dictionary.generate
// reports success, and -1 otherwise.
int english_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector& forms) const {
  forms.clear();

  tag_filter filter(tag_wildcard);

  const bool generated = lemma.len && dictionary.generate(lemma, filter, forms);
  if (generated)
    return NO_GUESSER;

  return -1;
}
|
5336
|
|
|
|
|
|
|
|
|
5337
|
0
|
|
|
|
|
|
// Returns the raw lemma length of `lemma`, as computed by
// english_lemma_addinfo::raw_lemma_len.
int english_morpho::raw_lemma_len(string_piece lemma) const {
  const int length = english_lemma_addinfo::raw_lemma_len(lemma);
  return length;
}
|
5340
|
|
|
|
|
|
|
|
|
5341
|
0
|
|
|
|
|
|
// Returns the lemma id length of `lemma`, as computed by
// english_lemma_addinfo::lemma_id_len.
int english_morpho::lemma_id_len(string_piece lemma) const {
  const int length = english_lemma_addinfo::lemma_id_len(lemma);
  return length;
}
|
5344
|
|
|
|
|
|
|
|
|
5345
|
0
|
|
|
|
|
|
// Returns the raw form length, which for this morphology is simply the
// length of the whole form.
int english_morpho::raw_form_len(string_piece form) const {
  const int whole_length = form.len;
  return whole_length;
}
|
5348
|
|
|
|
|
|
|
|
|
5349
|
0
|
|
|
|
|
|
// Creates a new English tokenizer matching this model.
//
// Models with `version` up to 2 get tokenizer version 1, newer ones get
// tokenizer version 2. The tokenizer is allocated with `new` and returned as
// a raw pointer.
tokenizer* english_morpho::new_tokenizer() const {
  if (version <= 2)
    return new english_tokenizer(1);
  return new english_tokenizer(2);
}
|
5352
|
|
|
|
|
|
|
|
|
5353
|
0
|
|
|
|
|
|
// Analyzes forms which are recognized by their shape instead of by a
// dictionary lookup: single-character punctuation exceptions, numbers, and
// sequences of quotation marks, parentheses, symbols or other punctuation.
//
// Analyses are appended to `lemmas`; when the form matches none of the
// patterns (or is empty), `lemmas` is left untouched. The lemma of every
// produced analysis is the form itself, except for the plural reading of
// numbers like "1990s", where the trailing 's' is dropped.
void english_morpho::analyze_special(string_piece form, vector& lemmas) const {
  using namespace unilib;

  // Analyzer for numbers and punctuation.
  if (!form.len) return;

  // One-letter punctuation exceptions.
  if (form.len == 1)
    switch(*form.str) {
      case '.':
      case '!':
      case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return;
      case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return;
      case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return;
      case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return;
      case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
      case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
      case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag);
                lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
      case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag);
                lemmas.emplace_back(string(form.str, form.len), sym_tag); return;
      case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag);
                lemmas.emplace_back(string(form.str, form.len), nn_tag); return;
      case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag);
                lemmas.emplace_back(string(form.str, form.len), in_tag); return;
      case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag);
                 lemmas.emplace_back(string(form.str, form.len), pos_tag); return;
    }

  // Try matching a number: [+-]? is_Pn* (, is_Pn{3})* (. is_Pn*)? (s | [Ee] [+-]? is_Pn+)? with at least one digit
  //
  // `number` always holds the not yet consumed rest of the form and
  // `codepoint` the most recently decoded character.
  string_piece number = form;
  char32_t codepoint = utf8::decode(number.str, number.len);
  bool any_digit = false;
  if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
  while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  while (codepoint == ',') {
    // A comma is accepted only when followed by exactly a group of three
    // digits; the group is parsed on a copy so that nothing is consumed
    // when it does not match.
    string_piece group = number;
    bool three_digits = true;
    for (int digit = 0; digit < 3 && three_digits; digit++)
      // The category has to be computed first and only then tested against
      // the number mask (masking the code point itself is meaningless).
      three_digits = unicode::category(utf8::decode(group.str, group.len)) & unicode::N;
    if (!three_digits) break;
    any_digit = true;
    number = group;
    codepoint = utf8::decode(number.str, number.len);
  }
  if (codepoint == '.' && number.len) {
    codepoint = utf8::decode(number.str, number.len);
    while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  }
  if (version >= 2 && any_digit && codepoint == 's' && !number.len) {
    // Digits followed by a final 's': a number, or a plural noun whose lemma
    // is the form without the 's'.
    lemmas.emplace_back(string(form.str, form.len), number_tag);
    lemmas.emplace_back(string(form.str, form.len - 1), nns_tag);
    return;
  }
  if (any_digit && (codepoint == 'e' || codepoint == 'E')) {
    // Exponent; it must contain at least one digit of its own.
    codepoint = utf8::decode(number.str, number.len);
    if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len);
    any_digit = false;
    while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len);
  }
  if (any_digit && !number.len && (!codepoint || codepoint == '.')) {
    lemmas.emplace_back(string(form.str, form.len), number_tag);
    lemmas.emplace_back(string(form.str, form.len), nnp_tag);
    // A single digit 1-9, optionally followed by a dot, may also be a list item marker.
    if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9')
      lemmas.emplace_back(string(form.str, form.len), ls_tag);
    return;
  }

  // Open quotation, end quotation, open parentheses, end parentheses, symbol, or other
  //
  // Every flag stays true only while all characters seen so far belong to
  // the corresponding class; scanning stops once neither symbol nor
  // any_punctuation can hold anymore.
  string_piece punctuation = form;
  bool open_quotation = true, close_quotation = true, open_parenthesis = true, close_parenthesis = true, any_punctuation = true, symbol = true;
  while ((symbol || any_punctuation) && punctuation.len) {
    codepoint = utf8::decode(punctuation.str, punctuation.len);
    if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi;
    if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf;
    if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps;
    if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe;
    if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P;
    if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S;
  }
  // The whole form must have been consumed; the most specific class wins.
  if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; }
  if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; }
  if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; }
  if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; }
  if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; }
  if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; }
}
|
5439
|
|
|
|
|
|
|
|
|
5440
|
|
|
|
|
|
|
} // namespace morphodita |
|
5441
|
|
|
|
|
|
|
|
|
5442
|
|
|
|
|
|
|
///////// |
|
5443
|
|
|
|
|
|
|
// File: morphodita/morpho/english_morpho_guesser.cpp |
|
5444
|
|
|
|
|
|
|
///////// |
|
5445
|
|
|
|
|
|
|
|
|
5446
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
5447
|
|
|
|
|
|
|
// |
|
5448
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
5449
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
5450
|
|
|
|
|
|
|
// |
|
5451
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
5452
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
5453
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
5454
|
|
|
|
|
|
|
|
|
5455
|
|
|
|
|
|
|
// This code is a reimplementation of morphologic analyzer Morphium |
|
5456
|
|
|
|
|
|
|
// by Johanka Spoustova (Treex::Tool::EnglishMorpho::Analysis Perl module) |
|
5457
|
|
|
|
|
|
|
// and reimplementation of morphologic lemmatizer by Martin Popel |
|
5458
|
|
|
|
|
|
|
// (Treex::Tool::EnglishMorpho::Lemmatizer Perl module). The latter is based |
|
5459
|
|
|
|
|
|
|
// on morpha: |
|
5460
|
|
|
|
|
|
|
// Minnen, G., J. Carroll and D. Pearce (2001). Applied morphological |
|
5461
|
|
|
|
|
|
|
// processing of English, Natural Language Engineering, 7(3). 207-223. |
|
5462
|
|
|
|
|
|
|
// Morpha has been released under LGPL as a part of RASP system |
|
5463
|
|
|
|
|
|
|
// http://ilexir.co.uk/applications/rasp/. |
|
5464
|
|
|
|
|
|
|
|
|
5465
|
|
|
|
|
|
|
namespace morphodita { |
|
5466
|
|
|
|
|
|
|
|
|
5467
|
0
|
|
|
|
|
|
// Loads the guesser data from `data`.
//
// The layout read here is: a 2-byte count of exception tags, then for every
// tag a 1-byte length followed by that many characters, then the exceptions
// table and finally the negations table. Errors are reported by whatever the
// decoder throws.
void english_morpho_guesser::load(binary_decoder& data) {
  const unsigned tag_count = data.next_2B();

  exceptions_tags.clear();
  exceptions_tags.reserve(tag_count);
  for (unsigned tag = 0; tag < tag_count; tag++) {
    const unsigned tag_len = data.next_1B();
    exceptions_tags.emplace_back(string(data.next(tag_len), tag_len));
  }

  exceptions.load(data);
  negations.load(data);
}
|
5479
|
|
|
|
|
|
|
|
|
5480
|
|
|
|
|
|
|
// Transition tables of the tag guesser state machine (the layout and naming
// look like table-driven Ragel output -- presumably generated, do not edit
// by hand). The machine is driven from english_morpho_guesser::analyze.

// Action lists: every list is stored as its length followed by action ids.
static const char _tag_guesser_actions[] = {
	0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1,
	7, 2, 2, 6, 2, 2, 7, 2, 4, 6, 2, 4, 7, 2, 5, 6,
	2, 5, 7, 2, 6, 7, 3, 2, 6, 7, 3, 4, 6, 7, 3, 5,
	6, 7
};

// For every state, the offset of its keys in _tag_guesser_trans_keys.
static const unsigned char _tag_guesser_key_offsets[] = {
	0, 19, 26, 34, 42, 50, 58, 66, 74, 82, 90, 100, 108, 116, 124, 132,
	145, 153, 161, 168, 179, 195, 212, 220, 228, 236
};

// Keys of every state: first the sorted single keys, then (low, high) range pairs.
static const char _tag_guesser_trans_keys[] = {
	45, 46, 99, 100, 103, 105, 109, 110, 114, 115, 116, 118, 120, 48, 57, 65,
	90, 97, 122, 45, 48, 57, 65, 90, 97, 122, 45, 114, 48, 57, 65, 90,
	97, 122, 45, 111, 48, 57, 65, 90, 97, 122, 45, 109, 48, 57, 65, 90,
	97, 122, 45, 101, 48, 57, 65, 90, 97, 122, 45, 115, 48, 57, 65, 90,
	97, 122, 45, 101, 48, 57, 65, 90, 97, 122, 45, 108, 48, 57, 65, 90,
	97, 122, 45, 115, 48, 57, 65, 90, 97, 122, 45, 97, 101, 111, 48, 57,
	65, 90, 98, 122, 45, 101, 48, 57, 65, 90, 97, 122, 45, 108, 48, 57,
	65, 90, 97, 122, 45, 109, 48, 57, 65, 90, 97, 122, 45, 105, 48, 57,
	65, 90, 97, 122, 45, 97, 101, 105, 111, 117, 121, 48, 57, 65, 90, 98,
	122, 45, 115, 48, 57, 65, 90, 97, 122, 45, 101, 48, 57, 65, 90, 97,
	122, 45, 48, 57, 65, 90, 97, 122, 45, 101, 114, 115, 116, 48, 57, 65,
	90, 97, 122, 45, 46, 105, 109, 118, 120, 48, 57, 65, 90, 97, 98, 99,
	100, 101, 122, 45, 46, 101, 105, 109, 118, 120, 48, 57, 65, 90, 97, 98,
	99, 100, 102, 122, 45, 110, 48, 57, 65, 90, 97, 122, 45, 105, 48, 57,
	65, 90, 97, 122, 45, 101, 48, 57, 65, 90, 97, 122, 45, 115, 48, 57,
	65, 90, 97, 122, 0
};

// Number of single keys of every state.
static const char _tag_guesser_single_lengths[] = {
	13, 1, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 7,
	2, 2, 1, 5, 6, 7, 2, 2, 2, 2
};

// Number of key ranges of every state.
static const char _tag_guesser_range_lengths[] = {
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	3, 3, 3, 3, 5, 5, 3, 3, 3, 3
};

// For every state, the offset of its entries in _tag_guesser_indicies.
static const unsigned char _tag_guesser_index_offsets[] = {
	0, 17, 22, 28, 34, 40, 46, 52, 58, 64, 70, 78, 84, 90, 96, 102,
	113, 119, 125, 130, 139, 151, 164, 170, 176, 182
};

// Transition index for every key (and for the default) of every state.
static const char _tag_guesser_indicies[] = {
	1, 2, 5, 6, 7, 5, 5, 8, 9, 10, 11, 5, 5, 3, 4, 4,
	0, 13, 14, 15, 15, 12, 13, 16, 14, 15, 15, 12, 13, 17, 14, 15,
	15, 12, 13, 18, 14, 15, 15, 12, 13, 18, 14, 15, 15, 12, 13, 19,
	14, 15, 15, 12, 13, 20, 14, 15, 15, 12, 13, 18, 14, 15, 15, 12,
	13, 21, 14, 15, 15, 12, 13, 22, 23, 24, 14, 15, 15, 12, 13, 25,
	14, 15, 15, 12, 13, 23, 14, 15, 15, 12, 13, 23, 14, 15, 15, 12,
	13, 26, 14, 15, 15, 12, 28, 15, 15, 15, 15, 15, 15, 29, 26, 26,
	27, 31, 4, 32, 33, 33, 30, 13, 23, 14, 15, 15, 12, 13, 14, 15,
	15, 12, 13, 34, 35, 36, 37, 14, 15, 15, 12, 13, 38, 39, 39, 39,
	39, 14, 15, 15, 39, 15, 12, 13, 38, 40, 39, 39, 39, 39, 14, 15,
	15, 39, 15, 12, 13, 41, 14, 15, 15, 12, 13, 42, 14, 15, 15, 12,
	13, 18, 14, 15, 15, 12, 13, 43, 14, 15, 15, 12, 0
};

// Target state of every transition.
static const char _tag_guesser_trans_targs[] = {
	18, 19, 20, 18, 18, 20, 21, 22, 23, 24, 16, 25, 18, 19, 18, 1,
	3, 4, 18, 7, 8, 10, 11, 18, 13, 12, 18, 18, 19, 18, 18, 19,
	18, 18, 2, 5, 6, 9, 20, 20, 18, 14, 15, 17
};

// Offset into _tag_guesser_actions of the action list of every transition (0 = no action).
static const char _tag_guesser_trans_actions[] = {
	29, 46, 29, 32, 11, 11, 11, 11, 11, 11, 0, 11, 13, 35, 15, 0,
	0, 0, 1, 0, 0, 0, 0, 3, 0, 0, 5, 17, 38, 20, 23, 42,
	26, 9, 0, 0, 0, 0, 13, 0, 7, 0, 0, 0
};

// Offset into _tag_guesser_actions of the action list run when the input ends in the given state.
static const char _tag_guesser_eof_actions[] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 15, 15, 0, 0, 0, 0
};

// Initial state of the machine.
static const int tag_guesser_start = 0;
|
5605
|
|
|
|
|
|
|
|
|
5606
|
0
|
|
|
|
|
|
// Guesses analyses of `form` (with `form_lc` being its lowercased variant)
// and appends them to `lemmas`.
//
// If the lowercased form is found in the exceptions list, the stored lemmas
// and tags are used. Otherwise a possible negative prefix is located, a fixed
// set of default tags is added and a table-driven state machine, fed with the
// characters of `form_lc` from the last one to the first one, adds further
// tags. In both cases analyze_proper_names is called at the end.
void english_morpho_guesser::analyze(string_piece form, string_piece form_lc, vector& lemmas) const {
  // Try exceptions list; the callback only advances the decoder over one stored entry.
  auto* exception = exceptions.at(form_lc.str, form_lc.len, [](pointer_decoder& entry){
    for (unsigned lemmas_left = entry.next_1B(); lemmas_left; lemmas_left--) {
      entry.next(entry.next_1B());
      entry.next(entry.next_1B());
    }
  });

  if (exception) {
    // Found in exceptions list: emit every stored (lemma, tag) pair.
    pointer_decoder data(exception);
    for (unsigned lemmas_left = data.next_1B(); lemmas_left; lemmas_left--) {
      unsigned lemma_len = data.next_1B();
      string lemma(data.next(lemma_len), lemma_len);
      for (unsigned tags_left = data.next_1B(); tags_left; tags_left--)
        lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]);
    }
  } else {
    // Try stripping negative prefix and use rule guesser
    string lemma_lc(form_lc.str, form_lc.len);

    // Try finding negative prefix: growing prefixes are looked up until one
    // is missing from the table; the last usable match determines the length.
    unsigned negation_len = 0;
    for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) {
      auto found = negations.at(form_lc.str, prefix, [](pointer_decoder& data){ data.next(TOTAL); });
      if (!found) break;
      // The negation applies only if enough characters follow the prefix.
      if (found[NEGATION_LEN] && form_lc.len - prefix >= found[TO_FOLLOW])
        negation_len = found[NEGATION_LEN];
    }

    // Add default tags
    add(FW, lemma_lc, lemmas);
    add(JJ, lemma_lc, negation_len, lemmas);
    add(RB, lemma_lc, negation_len, lemmas);
    add(NN, lemma_lc, negation_len, lemmas);
    add_NNS(lemma_lc, negation_len, lemmas);

    // Add specialized tags
    const char* p = form_lc.str;
    const char* const pe = form_lc.str + form_lc.len;
    int cs = tag_guesser_start;
    bool added_JJR_RBR = false, added_JJS_RBS = false, added_SYM = false, added_CD = false;

    {
      int _klen;
      unsigned int _trans;
      const char *_acts;
      unsigned int _nacts;
      const char *_keys;
      char _ch;

      if (p == pe)
        goto _test_eof;

_resume:
      // The input is consumed backwards: the step at offset (p - form_lc.str)
      // reads the character that far from the end of the form.
      _ch = form_lc.str[form_lc.len - 1 - (p - form_lc.str)];
      _keys = _tag_guesser_trans_keys + _tag_guesser_key_offsets[cs];
      _trans = _tag_guesser_index_offsets[cs];

      // Binary search among the single keys of the current state.
      _klen = _tag_guesser_single_lengths[cs];
      if (_klen > 0) {
        const char *_lower = _keys;
        const char *_upper = _keys + _klen - 1;
        while (!(_upper < _lower)) {
          const char *_mid = _lower + ((_upper - _lower) >> 1);
          if (_ch < *_mid)
            _upper = _mid - 1;
          else if (_ch > *_mid)
            _lower = _mid + 1;
          else {
            _trans += (unsigned int)(_mid - _keys);
            goto _match;
          }
        }
        _keys += _klen;
        _trans += _klen;
      }

      // Binary search among the (low, high) key ranges of the current state.
      _klen = _tag_guesser_range_lengths[cs];
      if (_klen > 0) {
        const char *_lower = _keys;
        const char *_upper = _keys + (_klen << 1) - 2;
        while (!(_upper < _lower)) {
          const char *_mid = _lower + (((_upper - _lower) >> 1) & ~1);
          if (_ch < _mid[0])
            _upper = _mid - 2;
          else if (_ch > _mid[1])
            _lower = _mid + 2;
          else {
            _trans += (unsigned int)((_mid - _keys) >> 1);
            goto _match;
          }
        }
        // No key matched: use the entry following all keys.
        _trans += _klen;
      }

_match:
      _trans = _tag_guesser_indicies[_trans];
      cs = _tag_guesser_trans_targs[_trans];

      if (_tag_guesser_trans_actions[_trans] == 0)
        goto _again;

      // Run all actions attached to the taken transition.
      _acts = _tag_guesser_actions + _tag_guesser_trans_actions[_trans];
      _nacts = (unsigned int) *_acts++;
      while (_nacts-- > 0) {
        switch (*_acts++) {
          case 0:
            if (!added_JJR_RBR) { added_JJR_RBR = true; add_JJR_RBR(lemma_lc, negation_len, lemmas); }
            break;
          case 1:
            if (!added_JJS_RBS) { added_JJS_RBS = true; add_JJS_RBS(lemma_lc, negation_len, lemmas); }
            break;
          case 2:
            add_VBG(lemma_lc, lemmas);
            break;
          case 3:
            add_VBD_VBN(lemma_lc, lemmas);
            break;
          case 4:
            add_VBZ(lemma_lc, lemmas);
            break;
          case 5:
            add(VB, lemma_lc, lemmas);
            add(VBP, lemma_lc, lemmas);
            break;
          case 6:
            if (!added_SYM) { added_SYM = true; add(SYM, lemma_lc, lemmas); }
            break;
          case 7:
            if (!added_CD) { added_CD = true; add(CD, lemma_lc, lemmas); }
            break;
        }
      }

_again:
      if (++p != pe)
        goto _resume;

_test_eof: {}
      // Actions executed when the whole input has been consumed.
      if (p == pe) {
        const char *__acts = _tag_guesser_actions + _tag_guesser_eof_actions[cs];
        unsigned int __nacts = (unsigned int) *__acts++;
        while (__nacts-- > 0) {
          switch (*__acts++) {
            case 7:
              if (!added_CD) { added_CD = true; add(CD, lemma_lc, lemmas); }
              break;
          }
        }
      }
    }
  }

  // Add proper names
  analyze_proper_names(form, form_lc, lemmas);
}
|
5774
|
|
|
|
|
|
|
|
|
5775
|
0
|
|
|
|
|
|
// Adds proper-name analyses (NNP and NNPS) of `form` to `lemmas` when they
// apply and are not present yet; returns whether anything was added.
//
// NNP applies if form_lc != form or form.str[0] =~ /[0-9']/, NNPS applies if
// form_lc != form. Note that "form_lc != form" is decided by comparing the
// data pointers, not the contents: callers pass `form` itself as `form_lc`
// when lowercasing does not change the form.
bool english_morpho_guesser::analyze_proper_names(string_piece form, string_piece form_lc, vector& lemmas) const {
  const bool lowercasing_changed_form = form.str != form_lc.str;
  const bool starts_with_digit_or_apostrophe =
      form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'));

  const bool is_NNP = lowercasing_changed_form || starts_with_digit_or_apostrophe;
  const bool is_NNPS = lowercasing_changed_form;
  if (!is_NNP && !is_NNPS) return false;

  // Find out which of the two tags the existing analyses already contain.
  bool was_NNP = false, was_NNPS = false;
  for (auto&& analysis : lemmas) {
    if (analysis.tag == NNP) was_NNP = true;
    if (analysis.tag == NNPS) was_NNPS = true;
  }

  const bool add_NNP_analysis = is_NNP && !was_NNP;
  const bool add_NNPS_analysis = is_NNPS && !was_NNPS;
  if (!add_NNP_analysis && !add_NNPS_analysis) return false;

  // Proper names keep the original casing of the form as the lemma.
  string lemma(form.str, form.len);
  if (add_NNP_analysis) add(NNP, lemma, lemmas);
  if (add_NNPS_analysis) add_NNPS(lemma, lemmas);
  return true;
}
|
5793
|
|
|
|
|
|
|
|
|
5794
|
|
|
|
|
|
|
// Appends one analysis to `lemmas`: the lemma is `form` as given, the tag is `tag`.
inline void english_morpho_guesser::add(const string& tag, const string& form, vector& lemmas) const {
  lemmas.emplace_back(form, tag);
}
|
5797
|
|
|
|
|
|
|
|
|
5798
|
|
|
|
|
|
|
inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, vector& lemmas) const { |
|
5799
|
|
|
|
|
|
|
add(tag, form, lemmas); |
|
5800
|
|
|
|
|
|
|
add(tag2, form, lemmas); |
|
5801
|
|
|
|
|
|
|
} |
|
5802
|
|
|
|
|
|
|
|
|
5803
|
0
|
|
|
|
|
|
// Appends one analysis tagged `tag` to `lemmas`, taking a negative prefix
// into account.
//
// With negation_len == 0 the lemma is `form` itself. Otherwise the first
// negation_len characters are moved behind a '^' separator, so the lemma has
// the shape "<rest>^<prefix>".
inline void english_morpho_guesser::add(const string& tag, const string& form, unsigned negation_len, vector& lemmas) const {
  if (!negation_len) {
    lemmas.emplace_back(form, tag);
    return;
  }

  lemmas.emplace_back(form.substr(negation_len) + "^" + form.substr(0, negation_len), tag);
}
|
5806
|
|
|
|
|
|
|
|
|
5807
|
0
|
|
|
|
|
|
inline void english_morpho_guesser::add(const string& tag, const string& tag2, const string& form, unsigned negation_len, vector& lemmas) const { |
|
5808
|
0
|
|
|
|
|
|
add(tag, form, negation_len, lemmas); |
|
5809
|
0
|
|
|
|
|
|
add(tag2, form, negation_len, lemmas); |
|
5810
|
0
|
|
|
|
|
|
} |
|
5811
|
|
|
|
|
|
|
|
|
5812
|
|
|
|
|
|
|
// Common definitions (written backwards) |
|
5813
|
|
|
|
|
|
|
// REM(str, len): copy of `str` without its last `len` characters.
// REM_ADD(str, len, add): the same copy with `add` appended.
// All macro arguments are parenthesized so that expressions such as
// `REM(s, a + b)` expand with the intended precedence.
#define REM(str, len) ((str).substr(0, (str).size() - (len)))
#define REM_ADD(str, len, add) ((str).substr(0, (str).size() - (len)).append(add))
|
5815
|
|
|
|
|
|
|
|
|
5816
|
|
|
|
|
|
|
// Tables of the NNS state machine (same table-driven layout as the other
// machines in this file -- presumably generated, do not edit by hand).

// Action lists: every list is stored as its length followed by action ids.
static const char _NNS_actions[] = {
	0, 1, 0, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1,
	7, 1, 8, 1, 9, 1, 10, 1, 11, 1, 12, 1, 13
};

// For every state, the offset of its keys in _NNS_trans_keys.
static const char _NNS_key_offsets[] = {
	0, 0, 2, 3, 4, 5, 7, 17, 17, 29, 30, 35, 35, 36, 37, 37,
	37, 44, 45, 53, 63, 72
};

// Keys of every state: first the sorted single keys, then (low, high) range pairs.
static const char _NNS_trans_keys[] = {
	110, 115, 101, 109, 101, 99, 115, 98, 100, 102, 104, 106, 110, 112, 116, 118,
	122, 104, 122, 98, 100, 102, 103, 106, 110, 112, 116, 118, 120, 111, 97, 101,
	105, 111, 117, 105, 119, 104, 105, 111, 115, 118, 120, 122, 115, 97, 101, 105,
	110, 111, 114, 115, 117, 98, 100, 102, 104, 106, 110, 112, 116, 118, 122, 97,
	101, 105, 111, 117, 121, 122, 98, 120, 0
};

// Number of single keys of every state.
static const char _NNS_single_lengths[] = {
	0, 2, 1, 1, 1, 2, 0, 0, 2, 1, 5, 0, 1, 1, 0, 0,
	7, 1, 8, 0, 7, 0
};

// Number of key ranges of every state.
static const char _NNS_range_lengths[] = {
	0, 0, 0, 0, 0, 0, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 5, 1, 0
};

// For every state, the offset of its entries in the transition index table.
static const char _NNS_index_offsets[] = {
	0, 0, 3, 5, 7, 9, 12, 18, 19, 27, 29, 35, 36, 38, 40, 41,
	42, 50, 52, 61, 67, 76
};
|
5859
|
|
|
|
|
|
|
|
|
5860
|
|
|
|
|
|
|
static const char _NNS_indicies[] = { |
|
5861
|
|
|
|
|
|
|
0, 2, 1, 3, 1, 4, 1, 6, |
|
5862
|
|
|
|
|
|
|
5, 7, 7, 1, 8, 8, 8, 8, |
|
5863
|
|
|
|
|
|
|
8, 1, 9, 11, 10, 10, 10, 10, |
|
5864
|
|
|
|
|
|
|
10, 10, 1, 12, 1, 13, 13, 13, |
|
5865
|
|
|
|
|
|
|
13, 13, 1, 14, 15, 1, 16, 1, |
|
5866
|
|
|
|
|
|
|
17, 1, 18, 19, 20, 21, 22, 7, |
|
5867
|
|
|
|
|
|
|
23, 1, 24, 1, 25, 25, 25, 26, |
|
5868
|
|
|
|
|
|
|
25, 27, 28, 29, 1, 30, 30, 30, |
|
5869
|
|
|
|
|
|
|
30, 30, 1, 31, 31, 31, 31, 31, |
|
5870
|
|
|
|
|
|
|
31, 33, 32, 1, 17, 0 |
|
5871
|
|
|
|
|
|
|
}; |
|
5872
|
|
|
|
|
|
|
|
|
5873
|
|
|
|
|
|
|
static const char _NNS_trans_targs[] = { |
|
5874
|
|
|
|
|
|
|
2, 0, 4, 3, 15, 15, 16, 15, |
|
5875
|
|
|
|
|
|
|
7, 15, 15, 17, 15, 11, 15, 13, |
|
5876
|
|
|
|
|
|
|
15, 15, 5, 6, 8, 18, 12, 20, |
|
5877
|
|
|
|
|
|
|
15, 15, 9, 10, 15, 19, 15, 15, |
|
5878
|
|
|
|
|
|
|
14, 21 |
|
5879
|
|
|
|
|
|
|
}; |
|
5880
|
|
|
|
|
|
|
|
|
5881
|
|
|
|
|
|
|
static const char _NNS_trans_actions[] = { |
|
5882
|
|
|
|
|
|
|
0, 0, 0, 0, 1, 27, 27, 21, |
|
5883
|
|
|
|
|
|
|
0, 23, 25, 25, 19, 0, 17, 0, |
|
5884
|
|
|
|
|
|
|
5, 11, 0, 0, 0, 21, 0, 21, |
|
5885
|
|
|
|
|
|
|
3, 9, 0, 0, 15, 9, 7, 13, |
|
5886
|
|
|
|
|
|
|
0, 15 |
|
5887
|
|
|
|
|
|
|
}; |
|
5888
|
|
|
|
|
|
|
|
|
5889
|
|
|
|
|
|
|
static const int NNS_start = 1; |
|
5890
|
|
|
|
|
|
|
|
|
5891
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_NNS(const string& form, unsigned negation_len, vector& lemmas) const { |
|
5892
|
0
|
|
|
|
|
|
const char* p = form.c_str() + negation_len; int cs; |
|
5893
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
|
5894
|
|
|
|
|
|
|
|
|
5895
|
|
|
|
|
|
|
{ |
|
5896
|
|
|
|
|
|
|
cs = NNS_start; |
|
5897
|
|
|
|
|
|
|
} |
|
5898
|
|
|
|
|
|
|
|
|
5899
|
|
|
|
|
|
|
{ |
|
5900
|
|
|
|
|
|
|
int _klen; |
|
5901
|
|
|
|
|
|
|
unsigned int _trans; |
|
5902
|
|
|
|
|
|
|
const char *_acts; |
|
5903
|
|
|
|
|
|
|
unsigned int _nacts; |
|
5904
|
|
|
|
|
|
|
const char *_keys; |
|
5905
|
|
|
|
|
|
|
|
|
5906
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
5907
|
|
|
|
|
|
|
goto _test_eof; |
|
5908
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
5909
|
|
|
|
|
|
|
goto _out; |
|
5910
|
|
|
|
|
|
|
_resume: |
|
5911
|
0
|
|
|
|
|
|
_keys = _NNS_trans_keys + _NNS_key_offsets[cs]; |
|
5912
|
0
|
|
|
|
|
|
_trans = _NNS_index_offsets[cs]; |
|
5913
|
|
|
|
|
|
|
|
|
5914
|
0
|
|
|
|
|
|
_klen = _NNS_single_lengths[cs]; |
|
5915
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
5916
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
5917
|
|
|
|
|
|
|
const char *_mid; |
|
5918
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
|
5919
|
|
|
|
|
|
|
while (1) { |
|
5920
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
5921
|
|
|
|
|
|
|
break; |
|
5922
|
|
|
|
|
|
|
|
|
5923
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
5924
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
|
5925
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
5926
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
|
5927
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
5928
|
|
|
|
|
|
|
else { |
|
5929
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
5930
|
0
|
|
|
|
|
|
goto _match; |
|
5931
|
|
|
|
|
|
|
} |
|
5932
|
|
|
|
|
|
|
} |
|
5933
|
0
|
|
|
|
|
|
_keys += _klen; |
|
5934
|
0
|
|
|
|
|
|
_trans += _klen; |
|
5935
|
|
|
|
|
|
|
} |
|
5936
|
|
|
|
|
|
|
|
|
5937
|
0
|
|
|
|
|
|
_klen = _NNS_range_lengths[cs]; |
|
5938
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
5939
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
5940
|
|
|
|
|
|
|
const char *_mid; |
|
5941
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
|
5942
|
|
|
|
|
|
|
while (1) { |
|
5943
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
5944
|
|
|
|
|
|
|
break; |
|
5945
|
|
|
|
|
|
|
|
|
5946
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
5947
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
|
5948
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
5949
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
|
5950
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
5951
|
|
|
|
|
|
|
else { |
|
5952
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
5953
|
0
|
|
|
|
|
|
goto _match; |
|
5954
|
|
|
|
|
|
|
} |
|
5955
|
|
|
|
|
|
|
} |
|
5956
|
0
|
|
|
|
|
|
_trans += _klen; |
|
5957
|
|
|
|
|
|
|
} |
|
5958
|
|
|
|
|
|
|
|
|
5959
|
|
|
|
|
|
|
_match: |
|
5960
|
0
|
|
|
|
|
|
_trans = _NNS_indicies[_trans]; |
|
5961
|
0
|
|
|
|
|
|
cs = _NNS_trans_targs[_trans]; |
|
5962
|
|
|
|
|
|
|
|
|
5963
|
0
|
0
|
|
|
|
|
if ( _NNS_trans_actions[_trans] == 0 ) |
|
5964
|
|
|
|
|
|
|
goto _again; |
|
5965
|
|
|
|
|
|
|
|
|
5966
|
0
|
|
|
|
|
|
_acts = _NNS_actions + _NNS_trans_actions[_trans]; |
|
5967
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
|
5968
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
|
5969
|
|
|
|
|
|
|
{ |
|
5970
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
|
5971
|
|
|
|
|
|
|
{ |
|
5972
|
|
|
|
|
|
|
case 0: |
|
5973
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 2, append = "an"; } |
|
5974
|
|
|
|
|
|
|
break; |
|
5975
|
|
|
|
|
|
|
case 1: |
|
5976
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 1, append = nullptr; } |
|
5977
|
|
|
|
|
|
|
break; |
|
5978
|
|
|
|
|
|
|
case 2: |
|
5979
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = "fe"; } |
|
5980
|
|
|
|
|
|
|
break; |
|
5981
|
|
|
|
|
|
|
case 3: |
|
5982
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
5983
|
|
|
|
|
|
|
break; |
|
5984
|
|
|
|
|
|
|
case 4: |
|
5985
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
5986
|
|
|
|
|
|
|
break; |
|
5987
|
|
|
|
|
|
|
case 5: |
|
5988
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
5989
|
|
|
|
|
|
|
break; |
|
5990
|
|
|
|
|
|
|
case 6: |
|
5991
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
|
5992
|
|
|
|
|
|
|
break; |
|
5993
|
|
|
|
|
|
|
case 7: |
|
5994
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
5995
|
|
|
|
|
|
|
break; |
|
5996
|
|
|
|
|
|
|
case 8: |
|
5997
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
|
5998
|
|
|
|
|
|
|
break; |
|
5999
|
|
|
|
|
|
|
case 9: |
|
6000
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
|
6001
|
|
|
|
|
|
|
break; |
|
6002
|
|
|
|
|
|
|
case 10: |
|
6003
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
|
6004
|
|
|
|
|
|
|
break; |
|
6005
|
|
|
|
|
|
|
case 11: |
|
6006
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 3, append = "y"; } |
|
6007
|
|
|
|
|
|
|
break; |
|
6008
|
|
|
|
|
|
|
case 12: |
|
6009
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
|
6010
|
|
|
|
|
|
|
break; |
|
6011
|
|
|
|
|
|
|
case 13: |
|
6012
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
|
6013
|
|
|
|
|
|
|
break; |
|
6014
|
|
|
|
|
|
|
} |
|
6015
|
|
|
|
|
|
|
} |
|
6016
|
|
|
|
|
|
|
|
|
6017
|
|
|
|
|
|
|
_again: |
|
6018
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
6019
|
|
|
|
|
|
|
goto _out; |
|
6020
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
|
6021
|
|
|
|
|
|
|
goto _resume; |
|
6022
|
|
|
|
|
|
|
_test_eof: {} |
|
6023
|
|
|
|
|
|
|
_out: {} |
|
6024
|
|
|
|
|
|
|
} |
|
6025
|
|
|
|
|
|
|
|
|
6026
|
0
|
0
|
|
|
|
|
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
6027
|
0
|
|
|
|
|
|
} |
|
6028
|
|
|
|
|
|
|
|
|
6029
|
|
|
|
|
|
|
static const char _NNPS_actions[] = { |
|
6030
|
|
|
|
|
|
|
0, 1, 1, 1, 2, 1, 4, 1, |
|
6031
|
|
|
|
|
|
|
5, 1, 6, 1, 7, 1, 8, 1, |
|
6032
|
|
|
|
|
|
|
9, 1, 10, 1, 11, 1, 12, 1, |
|
6033
|
|
|
|
|
|
|
14, 1, 15, 1, 16, 2, 0, 1, |
|
6034
|
|
|
|
|
|
|
2, 3, 4, 2, 13, 14 |
|
6035
|
|
|
|
|
|
|
}; |
|
6036
|
|
|
|
|
|
|
|
|
6037
|
|
|
|
|
|
|
static const unsigned char _NNPS_key_offsets[] = { |
|
6038
|
|
|
|
|
|
|
0, 0, 4, 6, 8, 10, 12, 16, |
|
6039
|
|
|
|
|
|
|
36, 36, 60, 62, 72, 72, 74, 76, |
|
6040
|
|
|
|
|
|
|
78, 78, 98, 98, 100, 102, 104, 104, |
|
6041
|
|
|
|
|
|
|
118, 120, 136, 156, 174, 174 |
|
6042
|
|
|
|
|
|
|
}; |
|
6043
|
|
|
|
|
|
|
|
|
6044
|
|
|
|
|
|
|
static const char _NNPS_trans_keys[] = { |
|
6045
|
|
|
|
|
|
|
78, 83, 110, 115, 69, 101, 77, 109, |
|
6046
|
|
|
|
|
|
|
77, 109, 69, 101, 67, 83, 99, 115, |
|
6047
|
|
|
|
|
|
|
66, 68, 70, 72, 74, 78, 80, 84, |
|
6048
|
|
|
|
|
|
|
86, 90, 98, 100, 102, 104, 106, 110, |
|
6049
|
|
|
|
|
|
|
112, 116, 118, 122, 72, 90, 104, 122, |
|
6050
|
|
|
|
|
|
|
66, 68, 70, 71, 74, 78, 80, 84, |
|
6051
|
|
|
|
|
|
|
86, 88, 98, 100, 102, 103, 106, 110, |
|
6052
|
|
|
|
|
|
|
112, 116, 118, 120, 79, 111, 65, 69, |
|
6053
|
|
|
|
|
|
|
73, 79, 85, 97, 101, 105, 111, 117, |
|
6054
|
|
|
|
|
|
|
73, 105, 87, 119, 87, 119, 66, 68, |
|
6055
|
|
|
|
|
|
|
70, 72, 74, 78, 80, 84, 86, 90, |
|
6056
|
|
|
|
|
|
|
98, 100, 102, 104, 106, 110, 112, 116, |
|
6057
|
|
|
|
|
|
|
118, 122, 73, 105, 69, 101, 69, 101, |
|
6058
|
|
|
|
|
|
|
72, 73, 79, 83, 86, 88, 90, 104, |
|
6059
|
|
|
|
|
|
|
105, 111, 115, 118, 120, 122, 83, 115, |
|
6060
|
|
|
|
|
|
|
65, 69, 73, 78, 79, 82, 83, 85, |
|
6061
|
|
|
|
|
|
|
97, 101, 105, 110, 111, 114, 115, 117, |
|
6062
|
|
|
|
|
|
|
66, 68, 70, 72, 74, 78, 80, 84, |
|
6063
|
|
|
|
|
|
|
86, 90, 98, 100, 102, 104, 106, 110, |
|
6064
|
|
|
|
|
|
|
112, 116, 118, 122, 65, 69, 73, 79, |
|
6065
|
|
|
|
|
|
|
85, 89, 90, 97, 101, 105, 111, 117, |
|
6066
|
|
|
|
|
|
|
121, 122, 66, 88, 98, 120, 72, 73, |
|
6067
|
|
|
|
|
|
|
79, 83, 86, 88, 90, 104, 105, 111, |
|
6068
|
|
|
|
|
|
|
115, 118, 120, 122, 0 |
|
6069
|
|
|
|
|
|
|
}; |
|
6070
|
|
|
|
|
|
|
|
|
6071
|
|
|
|
|
|
|
static const char _NNPS_single_lengths[] = { |
|
6072
|
|
|
|
|
|
|
0, 4, 2, 2, 2, 2, 4, 0, |
|
6073
|
|
|
|
|
|
|
0, 4, 2, 10, 0, 2, 2, 2, |
|
6074
|
|
|
|
|
|
|
0, 0, 0, 2, 2, 2, 0, 14, |
|
6075
|
|
|
|
|
|
|
2, 16, 0, 14, 0, 14 |
|
6076
|
|
|
|
|
|
|
}; |
|
6077
|
|
|
|
|
|
|
|
|
6078
|
|
|
|
|
|
|
static const char _NNPS_range_lengths[] = { |
|
6079
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 10, |
|
6080
|
|
|
|
|
|
|
0, 10, 0, 0, 0, 0, 0, 0, |
|
6081
|
|
|
|
|
|
|
0, 10, 0, 0, 0, 0, 0, 0, |
|
6082
|
|
|
|
|
|
|
0, 0, 10, 2, 0, 0 |
|
6083
|
|
|
|
|
|
|
}; |
|
6084
|
|
|
|
|
|
|
|
|
6085
|
|
|
|
|
|
|
static const unsigned char _NNPS_index_offsets[] = { |
|
6086
|
|
|
|
|
|
|
0, 0, 5, 8, 11, 14, 17, 22, |
|
6087
|
|
|
|
|
|
|
33, 34, 49, 52, 63, 64, 67, 70, |
|
6088
|
|
|
|
|
|
|
73, 74, 85, 86, 89, 92, 95, 96, |
|
6089
|
|
|
|
|
|
|
111, 114, 131, 142, 159, 160 |
|
6090
|
|
|
|
|
|
|
}; |
|
6091
|
|
|
|
|
|
|
|
|
6092
|
|
|
|
|
|
|
static const char _NNPS_indicies[] = { |
|
6093
|
|
|
|
|
|
|
0, 2, 3, 4, 1, 5, 6, 1, |
|
6094
|
|
|
|
|
|
|
7, 8, 1, 8, 8, 1, 10, 11, |
|
6095
|
|
|
|
|
|
|
9, 12, 12, 12, 12, 1, 13, 13, |
|
6096
|
|
|
|
|
|
|
13, 13, 13, 13, 13, 13, 13, 13, |
|
6097
|
|
|
|
|
|
|
1, 14, 16, 15, 16, 15, 15, 15, |
|
6098
|
|
|
|
|
|
|
15, 15, 15, 15, 15, 15, 15, 15, |
|
6099
|
|
|
|
|
|
|
1, 17, 17, 1, 18, 18, 18, 18, |
|
6100
|
|
|
|
|
|
|
18, 18, 18, 18, 18, 18, 1, 19, |
|
6101
|
|
|
|
|
|
|
20, 21, 1, 22, 23, 1, 23, 23, |
|
6102
|
|
|
|
|
|
|
1, 24, 25, 25, 25, 25, 25, 25, |
|
6103
|
|
|
|
|
|
|
25, 25, 25, 25, 1, 26, 21, 21, |
|
6104
|
|
|
|
|
|
|
1, 6, 6, 1, 11, 11, 9, 1, |
|
6105
|
|
|
|
|
|
|
27, 28, 29, 30, 31, 12, 32, 27, |
|
6106
|
|
|
|
|
|
|
33, 29, 30, 34, 12, 32, 1, 35, |
|
6107
|
|
|
|
|
|
|
35, 1, 36, 36, 36, 37, 36, 38, |
|
6108
|
|
|
|
|
|
|
39, 40, 36, 36, 36, 37, 36, 38, |
|
6109
|
|
|
|
|
|
|
39, 40, 1, 41, 41, 41, 41, 41, |
|
6110
|
|
|
|
|
|
|
41, 41, 41, 41, 41, 1, 42, 42, |
|
6111
|
|
|
|
|
|
|
42, 42, 42, 42, 44, 42, 42, 42, |
|
6112
|
|
|
|
|
|
|
42, 42, 42, 44, 43, 43, 1, 24, |
|
6113
|
|
|
|
|
|
|
27, 33, 29, 30, 34, 12, 32, 27, |
|
6114
|
|
|
|
|
|
|
33, 29, 30, 34, 12, 32, 1, 0 |
|
6115
|
|
|
|
|
|
|
}; |
|
6116
|
|
|
|
|
|
|
|
|
6117
|
|
|
|
|
|
|
static const char _NNPS_trans_targs[] = { |
|
6118
|
|
|
|
|
|
|
2, 0, 5, 20, 21, 3, 4, 22, |
|
6119
|
|
|
|
|
|
|
22, 22, 23, 29, 22, 8, 22, 22, |
|
6120
|
|
|
|
|
|
|
24, 22, 12, 22, 14, 15, 22, 22, |
|
6121
|
|
|
|
|
|
|
22, 18, 22, 6, 7, 9, 25, 13, |
|
6122
|
|
|
|
|
|
|
27, 17, 19, 22, 22, 10, 11, 22, |
|
6123
|
|
|
|
|
|
|
26, 22, 22, 16, 28 |
|
6124
|
|
|
|
|
|
|
}; |
|
6125
|
|
|
|
|
|
|
|
|
6126
|
|
|
|
|
|
|
static const char _NNPS_trans_actions[] = { |
|
6127
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 29, |
|
6128
|
|
|
|
|
|
|
1, 27, 27, 27, 21, 0, 35, 25, |
|
6129
|
|
|
|
|
|
|
25, 19, 0, 17, 0, 0, 32, 5, |
|
6130
|
|
|
|
|
|
|
11, 0, 23, 0, 0, 0, 21, 0, |
|
6131
|
|
|
|
|
|
|
21, 0, 0, 3, 9, 0, 0, 15, |
|
6132
|
|
|
|
|
|
|
9, 7, 13, 0, 15 |
|
6133
|
|
|
|
|
|
|
}; |
|
6134
|
|
|
|
|
|
|
|
|
6135
|
|
|
|
|
|
|
static const int NNPS_start = 1; |
|
6136
|
|
|
|
|
|
|
|
|
6137
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_NNPS(const string& form, vector& lemmas) const { |
|
6138
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
|
6139
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
|
6140
|
|
|
|
|
|
|
|
|
6141
|
|
|
|
|
|
|
{ |
|
6142
|
|
|
|
|
|
|
cs = NNPS_start; |
|
6143
|
|
|
|
|
|
|
} |
|
6144
|
|
|
|
|
|
|
|
|
6145
|
|
|
|
|
|
|
{ |
|
6146
|
|
|
|
|
|
|
int _klen; |
|
6147
|
|
|
|
|
|
|
unsigned int _trans; |
|
6148
|
|
|
|
|
|
|
const char *_acts; |
|
6149
|
|
|
|
|
|
|
unsigned int _nacts; |
|
6150
|
|
|
|
|
|
|
const char *_keys; |
|
6151
|
|
|
|
|
|
|
|
|
6152
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
6153
|
|
|
|
|
|
|
goto _test_eof; |
|
6154
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
6155
|
|
|
|
|
|
|
goto _out; |
|
6156
|
|
|
|
|
|
|
_resume: |
|
6157
|
0
|
|
|
|
|
|
_keys = _NNPS_trans_keys + _NNPS_key_offsets[cs]; |
|
6158
|
0
|
|
|
|
|
|
_trans = _NNPS_index_offsets[cs]; |
|
6159
|
|
|
|
|
|
|
|
|
6160
|
0
|
|
|
|
|
|
_klen = _NNPS_single_lengths[cs]; |
|
6161
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
6162
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
6163
|
|
|
|
|
|
|
const char *_mid; |
|
6164
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
|
6165
|
|
|
|
|
|
|
while (1) { |
|
6166
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
6167
|
|
|
|
|
|
|
break; |
|
6168
|
|
|
|
|
|
|
|
|
6169
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
6170
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
6171
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
6172
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
6173
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
6174
|
|
|
|
|
|
|
else { |
|
6175
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
6176
|
0
|
|
|
|
|
|
goto _match; |
|
6177
|
|
|
|
|
|
|
} |
|
6178
|
|
|
|
|
|
|
} |
|
6179
|
0
|
|
|
|
|
|
_keys += _klen; |
|
6180
|
0
|
|
|
|
|
|
_trans += _klen; |
|
6181
|
|
|
|
|
|
|
} |
|
6182
|
|
|
|
|
|
|
|
|
6183
|
0
|
|
|
|
|
|
_klen = _NNPS_range_lengths[cs]; |
|
6184
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
6185
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
6186
|
|
|
|
|
|
|
const char *_mid; |
|
6187
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
|
6188
|
|
|
|
|
|
|
while (1) { |
|
6189
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
6190
|
|
|
|
|
|
|
break; |
|
6191
|
|
|
|
|
|
|
|
|
6192
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
6193
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
6194
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
6195
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
6196
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
6197
|
|
|
|
|
|
|
else { |
|
6198
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
6199
|
0
|
|
|
|
|
|
goto _match; |
|
6200
|
|
|
|
|
|
|
} |
|
6201
|
|
|
|
|
|
|
} |
|
6202
|
0
|
|
|
|
|
|
_trans += _klen; |
|
6203
|
|
|
|
|
|
|
} |
|
6204
|
|
|
|
|
|
|
|
|
6205
|
|
|
|
|
|
|
_match: |
|
6206
|
0
|
|
|
|
|
|
_trans = _NNPS_indicies[_trans]; |
|
6207
|
0
|
|
|
|
|
|
cs = _NNPS_trans_targs[_trans]; |
|
6208
|
|
|
|
|
|
|
|
|
6209
|
0
|
0
|
|
|
|
|
if ( _NNPS_trans_actions[_trans] == 0 ) |
|
6210
|
|
|
|
|
|
|
goto _again; |
|
6211
|
|
|
|
|
|
|
|
|
6212
|
0
|
|
|
|
|
|
_acts = _NNPS_actions + _NNPS_trans_actions[_trans]; |
|
6213
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
|
6214
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
|
6215
|
|
|
|
|
|
|
{ |
|
6216
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
|
6217
|
|
|
|
|
|
|
{ |
|
6218
|
|
|
|
|
|
|
case 0: |
|
6219
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 2, append = "AN"; } |
|
6220
|
|
|
|
|
|
|
break; |
|
6221
|
|
|
|
|
|
|
case 1: |
|
6222
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 2, append = "an"; } |
|
6223
|
|
|
|
|
|
|
break; |
|
6224
|
|
|
|
|
|
|
case 2: |
|
6225
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
|
6226
|
|
|
|
|
|
|
break; |
|
6227
|
|
|
|
|
|
|
case 3: |
|
6228
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 3, append = "FE"; } |
|
6229
|
|
|
|
|
|
|
break; |
|
6230
|
|
|
|
|
|
|
case 4: |
|
6231
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 3, append = "fe"; } |
|
6232
|
|
|
|
|
|
|
break; |
|
6233
|
|
|
|
|
|
|
case 5: |
|
6234
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
6235
|
|
|
|
|
|
|
break; |
|
6236
|
|
|
|
|
|
|
case 6: |
|
6237
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
|
6238
|
|
|
|
|
|
|
break; |
|
6239
|
|
|
|
|
|
|
case 7: |
|
6240
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
6241
|
|
|
|
|
|
|
break; |
|
6242
|
|
|
|
|
|
|
case 8: |
|
6243
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
|
6244
|
|
|
|
|
|
|
break; |
|
6245
|
|
|
|
|
|
|
case 9: |
|
6246
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 2, append = nullptr; } |
|
6247
|
|
|
|
|
|
|
break; |
|
6248
|
|
|
|
|
|
|
case 10: |
|
6249
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 1, append = nullptr; } |
|
6250
|
|
|
|
|
|
|
break; |
|
6251
|
|
|
|
|
|
|
case 11: |
|
6252
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
|
6253
|
|
|
|
|
|
|
break; |
|
6254
|
|
|
|
|
|
|
case 12: |
|
6255
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
|
6256
|
|
|
|
|
|
|
break; |
|
6257
|
|
|
|
|
|
|
case 13: |
|
6258
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 3, append = "Y"; } |
|
6259
|
|
|
|
|
|
|
break; |
|
6260
|
|
|
|
|
|
|
case 14: |
|
6261
|
0
|
0
|
|
|
|
|
{ if (best > 'o') best = 'o', remove = 3, append = "y"; } |
|
6262
|
|
|
|
|
|
|
break; |
|
6263
|
|
|
|
|
|
|
case 15: |
|
6264
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 2, append = nullptr; } |
|
6265
|
|
|
|
|
|
|
break; |
|
6266
|
|
|
|
|
|
|
case 16: |
|
6267
|
0
|
0
|
|
|
|
|
{ if (best > 'q') best = 'q', remove = 1, append = nullptr; } |
|
6268
|
|
|
|
|
|
|
break; |
|
6269
|
|
|
|
|
|
|
} |
|
6270
|
|
|
|
|
|
|
} |
|
6271
|
|
|
|
|
|
|
|
|
6272
|
|
|
|
|
|
|
_again: |
|
6273
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
6274
|
|
|
|
|
|
|
goto _out; |
|
6275
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
|
6276
|
|
|
|
|
|
|
goto _resume; |
|
6277
|
|
|
|
|
|
|
_test_eof: {} |
|
6278
|
|
|
|
|
|
|
_out: {} |
|
6279
|
|
|
|
|
|
|
} |
|
6280
|
|
|
|
|
|
|
|
|
6281
|
0
|
0
|
|
|
|
|
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
|
0
|
|
|
|
|
|
|
6282
|
0
|
|
|
|
|
|
} |
|
6283
|
|
|
|
|
|
|
|
|
6284
|
|
|
|
|
|
|
static const char _VBG_actions[] = { |
|
6285
|
|
|
|
|
|
|
0, 1, 1, 1, 2, 1, 4, 1, |
|
6286
|
|
|
|
|
|
|
5, 1, 6, 1, 7, 1, 9, 1, |
|
6287
|
|
|
|
|
|
|
10, 1, 11, 1, 12, 1, 13, 1, |
|
6288
|
|
|
|
|
|
|
14, 1, 15, 1, 16, 1, 17, 2, |
|
6289
|
|
|
|
|
|
|
0, 12, 2, 3, 4, 2, 5, 9, |
|
6290
|
|
|
|
|
|
|
2, 5, 10, 2, 8, 9, 2, 9, |
|
6291
|
|
|
|
|
|
|
10, 2, 11, 12, 3, 0, 2, 12, |
|
6292
|
|
|
|
|
|
|
3, 2, 11, 12 |
|
6293
|
|
|
|
|
|
|
}; |
|
6294
|
|
|
|
|
|
|
|
|
6295
|
|
|
|
|
|
|
static const short _VBG_key_offsets[] = { |
|
6296
|
|
|
|
|
|
|
0, 0, 1, 2, 3, 9, 14, 24, |
|
6297
|
|
|
|
|
|
|
29, 34, 44, 46, 47, 48, 49, 50, |
|
6298
|
|
|
|
|
|
|
51, 52, 59, 66, 68, 70, 71, 72, |
|
6299
|
|
|
|
|
|
|
73, 74, 75, 76, 81, 89, 90, 91, |
|
6300
|
|
|
|
|
|
|
92, 93, 94, 96, 97, 98, 99, 100, |
|
6301
|
|
|
|
|
|
|
101, 102, 127, 127, 136, 137, 142, 153, |
|
6302
|
|
|
|
|
|
|
162, 171, 181, 186, 191, 197, 207, 207, |
|
6303
|
|
|
|
|
|
|
216, 228, 229, 240, 240, 249, 258, 267, |
|
6304
|
|
|
|
|
|
|
276, 285, 290, 302, 313, 318, 324, 334, |
|
6305
|
|
|
|
|
|
|
344, 355, 362, 373, 382, 391, 391, 402, |
|
6306
|
|
|
|
|
|
|
413, 415, 416, 417, 417, 418, 426, 437, |
|
6307
|
|
|
|
|
|
|
442, 448, 458, 468, 479, 486, 497, 504, |
|
6308
|
|
|
|
|
|
|
510, 519, 528, 537, 543 |
|
6309
|
|
|
|
|
|
|
}; |
|
6310
|
|
|
|
|
|
|
|
|
6311
|
|
|
|
|
|
|
static const char _VBG_trans_keys[] = { |
|
6312
|
|
|
|
|
|
|
103, 110, 105, 97, 101, 105, 111, 117, |
|
6313
|
|
|
|
|
|
|
121, 97, 101, 105, 111, 117, 98, 100, |
|
6314
|
|
|
|
|
|
|
102, 104, 106, 110, 112, 116, 118, 122, |
|
6315
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 97, 101, 105, |
|
6316
|
|
|
|
|
|
|
111, 117, 98, 100, 102, 104, 106, 110, |
|
6317
|
|
|
|
|
|
|
112, 116, 118, 122, 98, 114, 105, 114, |
|
6318
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
|
6319
|
|
|
|
|
|
|
117, 98, 122, 97, 101, 105, 111, 117, |
|
6320
|
|
|
|
|
|
|
98, 122, 97, 122, 98, 114, 105, 114, |
|
6321
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
|
6322
|
|
|
|
|
|
|
117, 97, 101, 105, 110, 111, 115, 117, |
|
6323
|
|
|
|
|
|
|
120, 105, 112, 105, 109, 101, 98, 114, |
|
6324
|
|
|
|
|
|
|
105, 114, 112, 105, 109, 101, 98, 99, |
|
6325
|
|
|
|
|
|
|
100, 102, 103, 104, 106, 107, 108, 109, |
|
6326
|
|
|
|
|
|
|
110, 111, 112, 113, 114, 115, 116, 117, |
|
6327
|
|
|
|
|
|
|
118, 119, 120, 121, 122, 97, 105, 97, |
|
6328
|
|
|
|
|
|
|
98, 101, 105, 111, 117, 122, 99, 120, |
|
6329
|
|
|
|
|
|
|
113, 97, 101, 105, 111, 117, 98, 99, |
|
6330
|
|
|
|
|
|
|
100, 105, 111, 117, 122, 97, 101, 102, |
|
6331
|
|
|
|
|
|
|
120, 97, 100, 101, 105, 111, 117, 122, |
|
6332
|
|
|
|
|
|
|
98, 120, 97, 101, 102, 105, 111, 117, |
|
6333
|
|
|
|
|
|
|
122, 98, 120, 97, 101, 103, 105, 110, |
|
6334
|
|
|
|
|
|
|
111, 117, 122, 98, 120, 97, 101, 105, |
|
6335
|
|
|
|
|
|
|
111, 117, 101, 110, 111, 115, 120, 101, |
|
6336
|
|
|
|
|
|
|
110, 111, 112, 115, 120, 97, 101, 104, |
|
6337
|
|
|
|
|
|
|
105, 111, 116, 117, 122, 98, 120, 97, |
|
6338
|
|
|
|
|
|
|
101, 105, 106, 111, 117, 122, 98, 120, |
|
6339
|
|
|
|
|
|
|
98, 99, 100, 105, 107, 111, 117, 122, |
|
6340
|
|
|
|
|
|
|
97, 101, 102, 120, 105, 97, 101, 105, |
|
6341
|
|
|
|
|
|
|
108, 111, 114, 117, 119, 122, 98, 120, |
|
6342
|
|
|
|
|
|
|
97, 101, 105, 109, 111, 117, 122, 98, |
|
6343
|
|
|
|
|
|
|
120, 97, 101, 105, 110, 111, 117, 122, |
|
6344
|
|
|
|
|
|
|
98, 120, 97, 101, 105, 111, 112, 117, |
|
6345
|
|
|
|
|
|
|
122, 98, 120, 97, 101, 105, 111, 113, |
|
6346
|
|
|
|
|
|
|
117, 122, 98, 120, 97, 101, 105, 111, |
|
6347
|
|
|
|
|
|
|
114, 117, 122, 98, 120, 97, 101, 105, |
|
6348
|
|
|
|
|
|
|
111, 117, 98, 99, 100, 105, 108, 111, |
|
6349
|
|
|
|
|
|
|
116, 117, 97, 101, 102, 122, 101, 110, |
|
6350
|
|
|
|
|
|
|
111, 115, 120, 98, 104, 106, 116, 118, |
|
6351
|
|
|
|
|
|
|
122, 101, 110, 111, 115, 120, 101, 110, |
|
6352
|
|
|
|
|
|
|
111, 112, 115, 120, 101, 105, 110, 111, |
|
6353
|
|
|
|
|
|
|
115, 120, 98, 116, 118, 122, 101, 105, |
|
6354
|
|
|
|
|
|
|
110, 111, 115, 120, 98, 116, 118, 122, |
|
6355
|
|
|
|
|
|
|
101, 110, 111, 115, 120, 98, 104, 106, |
|
6356
|
|
|
|
|
|
|
116, 118, 122, 98, 101, 110, 111, 114, |
|
6357
|
|
|
|
|
|
|
115, 120, 101, 110, 111, 115, 120, 98, |
|
6358
|
|
|
|
|
|
|
104, 106, 116, 118, 122, 97, 101, 105, |
|
6359
|
|
|
|
|
|
|
111, 115, 117, 122, 98, 120, 97, 101, |
|
6360
|
|
|
|
|
|
|
105, 111, 116, 117, 122, 98, 120, 122, |
|
6361
|
|
|
|
|
|
|
98, 100, 102, 104, 106, 110, 112, 116, |
|
6362
|
|
|
|
|
|
|
118, 120, 122, 98, 100, 102, 104, 106, |
|
6363
|
|
|
|
|
|
|
110, 112, 116, 118, 120, 98, 114, 112, |
|
6364
|
|
|
|
|
|
|
114, 113, 97, 101, 105, 108, 111, 117, |
|
6365
|
|
|
|
|
|
|
98, 122, 101, 110, 111, 115, 120, 98, |
|
6366
|
|
|
|
|
|
|
104, 106, 116, 118, 122, 101, 110, 111, |
|
6367
|
|
|
|
|
|
|
115, 120, 101, 110, 111, 112, 115, 120, |
|
6368
|
|
|
|
|
|
|
101, 105, 110, 111, 115, 120, 98, 116, |
|
6369
|
|
|
|
|
|
|
118, 122, 101, 105, 110, 111, 115, 120, |
|
6370
|
|
|
|
|
|
|
98, 116, 118, 122, 101, 110, 111, 115, |
|
6371
|
|
|
|
|
|
|
120, 98, 104, 106, 116, 118, 122, 98, |
|
6372
|
|
|
|
|
|
|
101, 110, 111, 114, 115, 120, 101, 110, |
|
6373
|
|
|
|
|
|
|
111, 115, 120, 98, 104, 106, 116, 118, |
|
6374
|
|
|
|
|
|
|
122, 97, 101, 105, 111, 117, 98, 122, |
|
6375
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 121, 97, 101, |
|
6376
|
|
|
|
|
|
|
105, 111, 117, 118, 122, 98, 120, 97, |
|
6377
|
|
|
|
|
|
|
101, 105, 111, 117, 119, 122, 98, 120, |
|
6378
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 120, 122, 98, |
|
6379
|
|
|
|
|
|
|
119, 97, 101, 105, 111, 117, 121, 97, |
|
6380
|
|
|
|
|
|
|
101, 105, 111, 117, 121, 122, 98, 120, |
|
6381
|
|
|
|
|
|
|
0 |
|
6382
|
|
|
|
|
|
|
}; |
|
6383
|
|
|
|
|
|
|
|
|
6384
|
|
|
|
|
|
|
static const char _VBG_single_lengths[] = { |
|
6385
|
|
|
|
|
|
|
0, 1, 1, 1, 6, 5, 0, 5, |
|
6386
|
|
|
|
|
|
|
5, 0, 2, 1, 1, 1, 1, 1, |
|
6387
|
|
|
|
|
|
|
1, 5, 5, 0, 2, 1, 1, 1, |
|
6388
|
|
|
|
|
|
|
1, 1, 1, 5, 8, 1, 1, 1, |
|
6389
|
|
|
|
|
|
|
1, 1, 2, 1, 1, 1, 1, 1, |
|
6390
|
|
|
|
|
|
|
1, 23, 0, 7, 1, 5, 7, 7, |
|
6391
|
|
|
|
|
|
|
7, 8, 5, 5, 6, 8, 0, 7, |
|
6392
|
|
|
|
|
|
|
8, 1, 9, 0, 7, 7, 7, 7, |
|
6393
|
|
|
|
|
|
|
7, 5, 8, 5, 5, 6, 6, 6, |
|
6394
|
|
|
|
|
|
|
5, 7, 5, 7, 7, 0, 1, 1, |
|
6395
|
|
|
|
|
|
|
2, 1, 1, 0, 1, 6, 5, 5, |
|
6396
|
|
|
|
|
|
|
6, 6, 6, 5, 7, 5, 5, 6, |
|
6397
|
|
|
|
|
|
|
7, 7, 7, 6, 7 |
|
6398
|
|
|
|
|
|
|
}; |
|
6399
|
|
|
|
|
|
|
|
|
6400
|
|
|
|
|
|
|
static const char _VBG_range_lengths[] = { |
|
6401
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 5, 0, |
|
6402
|
|
|
|
|
|
|
0, 5, 0, 0, 0, 0, 0, 0, |
|
6403
|
|
|
|
|
|
|
0, 1, 1, 1, 0, 0, 0, 0, |
|
6404
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6405
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6406
|
|
|
|
|
|
|
0, 1, 0, 1, 0, 0, 2, 1, |
|
6407
|
|
|
|
|
|
|
1, 1, 0, 0, 0, 1, 0, 1, |
|
6408
|
|
|
|
|
|
|
2, 0, 1, 0, 1, 1, 1, 1, |
|
6409
|
|
|
|
|
|
|
1, 0, 2, 3, 0, 0, 2, 2, |
|
6410
|
|
|
|
|
|
|
3, 0, 3, 1, 1, 0, 5, 5, |
|
6411
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 1, 3, 0, |
|
6412
|
|
|
|
|
|
|
0, 2, 2, 3, 0, 3, 1, 0, |
|
6413
|
|
|
|
|
|
|
1, 1, 1, 0, 1 |
|
6414
|
|
|
|
|
|
|
}; |
|
6415
|
|
|
|
|
|
|
|
|
6416
|
|
|
|
|
|
|
static const short _VBG_index_offsets[] = { |
|
6417
|
|
|
|
|
|
|
0, 0, 2, 4, 6, 13, 19, 25, |
|
6418
|
|
|
|
|
|
|
31, 37, 43, 46, 48, 50, 52, 54, |
|
6419
|
|
|
|
|
|
|
56, 58, 65, 72, 74, 77, 79, 81, |
|
6420
|
|
|
|
|
|
|
83, 85, 87, 89, 95, 104, 106, 108, |
|
6421
|
|
|
|
|
|
|
110, 112, 114, 117, 119, 121, 123, 125, |
|
6422
|
|
|
|
|
|
|
127, 129, 154, 155, 164, 166, 172, 182, |
|
6423
|
|
|
|
|
|
|
191, 200, 210, 216, 222, 229, 239, 240, |
|
6424
|
|
|
|
|
|
|
249, 260, 262, 273, 274, 283, 292, 301, |
|
6425
|
|
|
|
|
|
|
310, 319, 325, 336, 345, 351, 358, 367, |
|
6426
|
|
|
|
|
|
|
376, 385, 393, 402, 411, 420, 421, 428, |
|
6427
|
|
|
|
|
|
|
435, 438, 440, 442, 443, 445, 453, 462, |
|
6428
|
|
|
|
|
|
|
468, 475, 484, 493, 502, 510, 519, 526, |
|
6429
|
|
|
|
|
|
|
533, 542, 551, 560, 567 |
|
6430
|
|
|
|
|
|
|
}; |
|
6431
|
|
|
|
|
|
|
|
|
6432
|
|
|
|
|
|
|
static const unsigned char _VBG_indicies[] = { |
|
6433
|
|
|
|
|
|
|
0, 1, 2, 1, 3, 1, 4, 4, |
|
6434
|
|
|
|
|
|
|
4, 4, 4, 4, 1, 5, 5, 5, |
|
6435
|
|
|
|
|
|
|
5, 6, 1, 7, 7, 7, 7, 7, |
|
6436
|
|
|
|
|
|
|
1, 8, 8, 8, 8, 9, 1, 5, |
|
6437
|
|
|
|
|
|
|
5, 5, 5, 10, 1, 11, 11, 11, |
|
6438
|
|
|
|
|
|
|
11, 11, 1, 11, 12, 1, 11, 1, |
|
6439
|
|
|
|
|
|
|
13, 1, 11, 1, 14, 1, 11, 1, |
|
6440
|
|
|
|
|
|
|
11, 1, 5, 5, 5, 5, 6, 15, |
|
6441
|
|
|
|
|
|
|
1, 5, 5, 5, 5, 6, 16, 1, |
|
6442
|
|
|
|
|
|
|
4, 1, 17, 18, 1, 17, 1, 19, |
|
6443
|
|
|
|
|
|
|
1, 17, 1, 20, 1, 17, 1, 17, |
|
6444
|
|
|
|
|
|
|
1, 21, 22, 21, 23, 24, 1, 25, |
|
6445
|
|
|
|
|
|
|
26, 25, 27, 28, 29, 25, 30, 1, |
|
6446
|
|
|
|
|
|
|
31, 1, 31, 1, 32, 1, 31, 1, |
|
6447
|
|
|
|
|
|
|
31, 1, 33, 34, 1, 33, 1, 35, |
|
6448
|
|
|
|
|
|
|
1, 33, 1, 36, 1, 33, 1, 33, |
|
6449
|
|
|
|
|
|
|
1, 38, 39, 40, 41, 42, 43, 44, |
|
6450
|
|
|
|
|
|
|
45, 46, 47, 48, 49, 50, 51, 52, |
|
6451
|
|
|
|
|
|
|
53, 54, 55, 56, 57, 58, 59, 60, |
|
6452
|
|
|
|
|
|
|
37, 1, 1, 61, 62, 61, 61, 61, |
|
6453
|
|
|
|
|
|
|
61, 63, 63, 1, 64, 1, 65, 65, |
|
6454
|
|
|
|
|
|
|
65, 65, 65, 1, 67, 68, 67, 66, |
|
6455
|
|
|
|
|
|
|
66, 66, 67, 66, 67, 1, 69, 62, |
|
6456
|
|
|
|
|
|
|
69, 69, 69, 69, 63, 63, 1, 61, |
|
6457
|
|
|
|
|
|
|
61, 62, 61, 61, 61, 63, 63, 1, |
|
6458
|
|
|
|
|
|
|
66, 66, 68, 66, 70, 66, 66, 67, |
|
6459
|
|
|
|
|
|
|
67, 1, 71, 71, 71, 71, 71, 1, |
|
6460
|
|
|
|
|
|
|
72, 73, 74, 75, 76, 1, 72, 73, |
|
6461
|
|
|
|
|
|
|
74, 11, 75, 76, 1, 61, 61, 62, |
|
6462
|
|
|
|
|
|
|
61, 61, 77, 61, 63, 63, 1, 78, |
|
6463
|
|
|
|
|
|
|
61, 61, 61, 62, 61, 61, 63, 63, |
|
6464
|
|
|
|
|
|
|
1, 63, 79, 63, 61, 62, 61, 61, |
|
6465
|
|
|
|
|
|
|
63, 61, 63, 1, 7, 1, 61, 61, |
|
6466
|
|
|
|
|
|
|
61, 68, 61, 80, 61, 80, 67, 67, |
|
6467
|
|
|
|
|
|
|
1, 5, 61, 61, 61, 62, 61, 61, |
|
6468
|
|
|
|
|
|
|
63, 63, 1, 81, 81, 82, 62, 81, |
|
6469
|
|
|
|
|
|
|
81, 63, 63, 1, 81, 81, 81, 81, |
|
6470
|
|
|
|
|
|
|
62, 81, 63, 63, 1, 61, 61, 61, |
|
6471
|
|
|
|
|
|
|
61, 62, 61, 63, 63, 1, 61, 83, |
|
6472
|
|
|
|
|
|
|
61, 84, 62, 61, 63, 63, 1, 5, |
|
6473
|
|
|
|
|
|
|
5, 5, 5, 6, 1, 85, 86, 85, |
|
6474
|
|
|
|
|
|
|
5, 86, 5, 86, 6, 5, 85, 1, |
|
6475
|
|
|
|
|
|
|
87, 88, 89, 90, 91, 85, 85, 85, |
|
6476
|
|
|
|
|
|
|
1, 87, 92, 89, 93, 94, 1, 87, |
|
6477
|
|
|
|
|
|
|
92, 89, 17, 93, 94, 1, 87, 17, |
|
6478
|
|
|
|
|
|
|
88, 89, 90, 91, 85, 85, 1, 87, |
|
6479
|
|
|
|
|
|
|
20, 88, 89, 90, 91, 85, 85, 1, |
|
6480
|
|
|
|
|
|
|
95, 88, 89, 90, 91, 85, 85, 85, |
|
6481
|
|
|
|
|
|
|
1, 17, 87, 92, 89, 18, 93, 94, |
|
6482
|
|
|
|
|
|
|
1, 87, 97, 89, 98, 99, 96, 96, |
|
6483
|
|
|
|
|
|
|
96, 1, 66, 66, 66, 66, 100, 66, |
|
6484
|
|
|
|
|
|
|
67, 67, 1, 101, 102, 103, 61, 62, |
|
6485
|
|
|
|
|
|
|
61, 63, 63, 1, 104, 106, 106, 106, |
|
6486
|
|
|
|
|
|
|
106, 106, 106, 105, 107, 107, 107, 107, |
|
6487
|
|
|
|
|
|
|
107, 107, 1, 31, 108, 1, 31, 1, |
|
6488
|
|
|
|
|
|
|
109, 1, 105, 110, 104, 5, 5, 5, |
|
6489
|
|
|
|
|
|
|
112, 5, 6, 111, 1, 113, 114, 115, |
|
6490
|
|
|
|
|
|
|
116, 117, 111, 111, 111, 1, 113, 118, |
|
6491
|
|
|
|
|
|
|
115, 119, 120, 1, 113, 118, 115, 33, |
|
6492
|
|
|
|
|
|
|
119, 120, 1, 113, 33, 114, 115, 116, |
|
6493
|
|
|
|
|
|
|
117, 111, 111, 1, 113, 36, 114, 115, |
|
6494
|
|
|
|
|
|
|
116, 117, 111, 111, 1, 121, 114, 115, |
|
6495
|
|
|
|
|
|
|
116, 117, 111, 111, 111, 1, 33, 113, |
|
6496
|
|
|
|
|
|
|
118, 115, 34, 119, 120, 1, 113, 123, |
|
6497
|
|
|
|
|
|
|
115, 124, 125, 122, 122, 122, 1, 5, |
|
6498
|
|
|
|
|
|
|
5, 5, 5, 6, 111, 1, 4, 4, |
|
6499
|
|
|
|
|
|
|
4, 4, 4, 4, 1, 66, 66, 66, |
|
6500
|
|
|
|
|
|
|
66, 66, 68, 67, 67, 1, 81, 81, |
|
6501
|
|
|
|
|
|
|
81, 81, 81, 62, 63, 63, 1, 81, |
|
6502
|
|
|
|
|
|
|
81, 81, 81, 81, 62, 63, 63, 1, |
|
6503
|
|
|
|
|
|
|
126, 126, 126, 126, 126, 4, 1, 127, |
|
6504
|
|
|
|
|
|
|
127, 127, 127, 127, 129, 130, 128, 1, |
|
6505
|
|
|
|
|
|
|
0 |
|
6506
|
|
|
|
|
|
|
}; |
|
6507
|
|
|
|
|
|
|
|
|
6508
|
|
|
|
|
|
|
static const char _VBG_trans_targs[] = { |
|
6509
|
|
|
|
|
|
|
2, 0, 3, 41, 42, 42, 44, 42, |
|
6510
|
|
|
|
|
|
|
42, 44, 44, 51, 52, 13, 15, 42, |
|
6511
|
|
|
|
|
|
|
42, 68, 69, 23, 25, 77, 78, 83, |
|
6512
|
|
|
|
|
|
|
84, 42, 80, 29, 82, 31, 33, 42, |
|
6513
|
|
|
|
|
|
|
32, 87, 88, 37, 39, 4, 43, 46, |
|
6514
|
|
|
|
|
|
|
47, 48, 49, 53, 55, 56, 58, 60, |
|
6515
|
|
|
|
|
|
|
61, 19, 62, 63, 64, 75, 76, 95, |
|
6516
|
|
|
|
|
|
|
96, 97, 98, 99, 100, 5, 45, 42, |
|
6517
|
|
|
|
|
|
|
42, 6, 7, 42, 45, 8, 50, 9, |
|
6518
|
|
|
|
|
|
|
10, 11, 12, 14, 16, 54, 42, 57, |
|
6519
|
|
|
|
|
|
|
59, 17, 18, 65, 66, 67, 74, 20, |
|
6520
|
|
|
|
|
|
|
70, 22, 71, 72, 21, 24, 26, 73, |
|
6521
|
|
|
|
|
|
|
67, 70, 71, 72, 45, 27, 85, 94, |
|
6522
|
|
|
|
|
|
|
42, 42, 79, 28, 81, 30, 42, 86, |
|
6523
|
|
|
|
|
|
|
93, 34, 89, 36, 90, 91, 35, 38, |
|
6524
|
|
|
|
|
|
|
40, 92, 86, 89, 90, 91, 65, 65, |
|
6525
|
|
|
|
|
|
|
42, 42, 45 |
|
6526
|
|
|
|
|
|
|
}; |
|
6527
|
|
|
|
|
|
|
|
|
6528
|
|
|
|
|
|
|
static const char _VBG_trans_actions[] = { |
|
6529
|
|
|
|
|
|
|
0, 0, 0, 29, 23, 15, 15, 3, |
|
6530
|
|
|
|
|
|
|
46, 46, 40, 0, 0, 0, 0, 5, |
|
6531
|
|
|
|
|
|
|
34, 0, 0, 0, 0, 15, 15, 15, |
|
6532
|
|
|
|
|
|
|
15, 11, 11, 0, 11, 0, 0, 9, |
|
6533
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6534
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6535
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 21, |
|
6536
|
|
|
|
|
|
|
0, 0, 0, 23, 0, 0, 19, 19, |
|
6537
|
|
|
|
|
|
|
7, 0, 0, 49, 49, 0, 49, 0, |
|
6538
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 19, 17, 19, |
|
6539
|
|
|
|
|
|
|
49, 0, 0, 27, 27, 0, 0, 0, |
|
6540
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6541
|
|
|
|
|
|
|
25, 25, 25, 25, 56, 0, 9, 9, |
|
6542
|
|
|
|
|
|
|
13, 43, 43, 0, 9, 0, 37, 0, |
|
6543
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6544
|
|
|
|
|
|
|
0, 0, 7, 7, 7, 7, 23, 1, |
|
6545
|
|
|
|
|
|
|
31, 1, 52 |
|
6546
|
|
|
|
|
|
|
}; |
|
6547
|
|
|
|
|
|
|
|
|
6548
|
|
|
|
|
|
|
static const char _VBG_eof_actions[] = { |
|
6549
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6550
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6551
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6552
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6553
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6554
|
|
|
|
|
|
|
0, 0, 0, 3, 0, 0, 3, 3, |
|
6555
|
|
|
|
|
|
|
3, 3, 0, 3, 3, 3, 0, 3, |
|
6556
|
|
|
|
|
|
|
3, 0, 3, 0, 3, 3, 3, 3, |
|
6557
|
|
|
|
|
|
|
3, 0, 0, 25, 25, 25, 25, 25, |
|
6558
|
|
|
|
|
|
|
25, 25, 25, 3, 3, 0, 0, 0, |
|
6559
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 7, 7, |
|
6560
|
|
|
|
|
|
|
7, 7, 7, 7, 7, 7, 0, 0, |
|
6561
|
|
|
|
|
|
|
3, 3, 3, 0, 3 |
|
6562
|
|
|
|
|
|
|
}; |
|
6563
|
|
|
|
|
|
|
|
|
6564
|
|
|
|
|
|
|
static const int VBG_start = 1; |
|
6565
|
|
|
|
|
|
|
|
|
6566
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_VBG(const string& form, vector& lemmas) const { |
|
6567
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
|
6568
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
|
6569
|
|
|
|
|
|
|
|
|
6570
|
|
|
|
|
|
|
{ |
|
6571
|
|
|
|
|
|
|
cs = VBG_start; |
|
6572
|
|
|
|
|
|
|
} |
|
6573
|
|
|
|
|
|
|
|
|
6574
|
|
|
|
|
|
|
{ |
|
6575
|
|
|
|
|
|
|
int _klen; |
|
6576
|
|
|
|
|
|
|
unsigned int _trans; |
|
6577
|
|
|
|
|
|
|
const char *_acts; |
|
6578
|
|
|
|
|
|
|
unsigned int _nacts; |
|
6579
|
|
|
|
|
|
|
const char *_keys; |
|
6580
|
|
|
|
|
|
|
|
|
6581
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
6582
|
|
|
|
|
|
|
goto _test_eof; |
|
6583
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
6584
|
|
|
|
|
|
|
goto _out; |
|
6585
|
|
|
|
|
|
|
_resume: |
|
6586
|
0
|
|
|
|
|
|
_keys = _VBG_trans_keys + _VBG_key_offsets[cs]; |
|
6587
|
0
|
|
|
|
|
|
_trans = _VBG_index_offsets[cs]; |
|
6588
|
|
|
|
|
|
|
|
|
6589
|
0
|
|
|
|
|
|
_klen = _VBG_single_lengths[cs]; |
|
6590
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
6591
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
6592
|
|
|
|
|
|
|
const char *_mid; |
|
6593
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
|
6594
|
|
|
|
|
|
|
while (1) { |
|
6595
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
6596
|
|
|
|
|
|
|
break; |
|
6597
|
|
|
|
|
|
|
|
|
6598
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
6599
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
6600
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
6601
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
6602
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
6603
|
|
|
|
|
|
|
else { |
|
6604
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
6605
|
0
|
|
|
|
|
|
goto _match; |
|
6606
|
|
|
|
|
|
|
} |
|
6607
|
|
|
|
|
|
|
} |
|
6608
|
0
|
|
|
|
|
|
_keys += _klen; |
|
6609
|
0
|
|
|
|
|
|
_trans += _klen; |
|
6610
|
|
|
|
|
|
|
} |
|
6611
|
|
|
|
|
|
|
|
|
6612
|
0
|
|
|
|
|
|
_klen = _VBG_range_lengths[cs]; |
|
6613
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
6614
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
6615
|
|
|
|
|
|
|
const char *_mid; |
|
6616
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
|
6617
|
|
|
|
|
|
|
while (1) { |
|
6618
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
6619
|
|
|
|
|
|
|
break; |
|
6620
|
|
|
|
|
|
|
|
|
6621
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
6622
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
6623
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
6624
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
6625
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
6626
|
|
|
|
|
|
|
else { |
|
6627
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
6628
|
0
|
|
|
|
|
|
goto _match; |
|
6629
|
|
|
|
|
|
|
} |
|
6630
|
|
|
|
|
|
|
} |
|
6631
|
0
|
|
|
|
|
|
_trans += _klen; |
|
6632
|
|
|
|
|
|
|
} |
|
6633
|
|
|
|
|
|
|
|
|
6634
|
|
|
|
|
|
|
_match: |
|
6635
|
0
|
|
|
|
|
|
_trans = _VBG_indicies[_trans]; |
|
6636
|
0
|
|
|
|
|
|
cs = _VBG_trans_targs[_trans]; |
|
6637
|
|
|
|
|
|
|
|
|
6638
|
0
|
0
|
|
|
|
|
if ( _VBG_trans_actions[_trans] == 0 ) |
|
6639
|
|
|
|
|
|
|
goto _again; |
|
6640
|
|
|
|
|
|
|
|
|
6641
|
0
|
|
|
|
|
|
_acts = _VBG_actions + _VBG_trans_actions[_trans]; |
|
6642
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
|
6643
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
|
6644
|
|
|
|
|
|
|
{ |
|
6645
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
|
6646
|
|
|
|
|
|
|
{ |
|
6647
|
|
|
|
|
|
|
case 0: |
|
6648
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
|
6649
|
|
|
|
|
|
|
break; |
|
6650
|
|
|
|
|
|
|
case 1: |
|
6651
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 3, append = "e"; } |
|
6652
|
|
|
|
|
|
|
break; |
|
6653
|
|
|
|
|
|
|
case 2: |
|
6654
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
|
6655
|
|
|
|
|
|
|
break; |
|
6656
|
|
|
|
|
|
|
case 3: |
|
6657
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 3, append = "e"; } |
|
6658
|
|
|
|
|
|
|
break; |
|
6659
|
|
|
|
|
|
|
case 4: |
|
6660
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 3, append = nullptr; } |
|
6661
|
|
|
|
|
|
|
break; |
|
6662
|
|
|
|
|
|
|
case 5: |
|
6663
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
|
6664
|
|
|
|
|
|
|
break; |
|
6665
|
|
|
|
|
|
|
case 6: |
|
6666
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 3, append = nullptr; } |
|
6667
|
|
|
|
|
|
|
break; |
|
6668
|
|
|
|
|
|
|
case 7: |
|
6669
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 3, append = "e"; } |
|
6670
|
|
|
|
|
|
|
break; |
|
6671
|
|
|
|
|
|
|
case 8: |
|
6672
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 3, append = nullptr; } |
|
6673
|
|
|
|
|
|
|
break; |
|
6674
|
|
|
|
|
|
|
case 9: |
|
6675
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 3, append = "e"; } |
|
6676
|
|
|
|
|
|
|
break; |
|
6677
|
|
|
|
|
|
|
case 10: |
|
6678
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 3, append = nullptr; } |
|
6679
|
|
|
|
|
|
|
break; |
|
6680
|
|
|
|
|
|
|
case 11: |
|
6681
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 3, append = "e"; } |
|
6682
|
|
|
|
|
|
|
break; |
|
6683
|
|
|
|
|
|
|
case 12: |
|
6684
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 3, append = nullptr; } |
|
6685
|
|
|
|
|
|
|
break; |
|
6686
|
|
|
|
|
|
|
case 13: |
|
6687
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 3, append = "e"; } |
|
6688
|
|
|
|
|
|
|
break; |
|
6689
|
|
|
|
|
|
|
case 14: |
|
6690
|
0
|
0
|
|
|
|
|
{ if (best > 'o') best = 'o', remove = 3, append = nullptr; } |
|
6691
|
|
|
|
|
|
|
break; |
|
6692
|
|
|
|
|
|
|
case 15: |
|
6693
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
|
6694
|
|
|
|
|
|
|
break; |
|
6695
|
|
|
|
|
|
|
case 16: |
|
6696
|
0
|
0
|
|
|
|
|
{ if (best > 'q') best = 'q', remove = 3, append = nullptr; } |
|
6697
|
|
|
|
|
|
|
break; |
|
6698
|
|
|
|
|
|
|
case 17: |
|
6699
|
0
|
0
|
|
|
|
|
{ if (best > 'r') best = 'r', remove = 3, append = "e"; } |
|
6700
|
|
|
|
|
|
|
break; |
|
6701
|
|
|
|
|
|
|
} |
|
6702
|
|
|
|
|
|
|
} |
|
6703
|
|
|
|
|
|
|
|
|
6704
|
|
|
|
|
|
|
_again: |
|
6705
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
6706
|
|
|
|
|
|
|
goto _out; |
|
6707
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
|
6708
|
|
|
|
|
|
|
goto _resume; |
|
6709
|
|
|
|
|
|
|
_test_eof: {} |
|
6710
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
6711
|
|
|
|
|
|
|
{ |
|
6712
|
0
|
|
|
|
|
|
const char *__acts = _VBG_actions + _VBG_eof_actions[cs]; |
|
6713
|
0
|
|
|
|
|
|
unsigned int __nacts = (unsigned int) *__acts++; |
|
6714
|
0
|
0
|
|
|
|
|
while ( __nacts-- > 0 ) { |
|
6715
|
0
|
|
|
|
|
|
switch ( *__acts++ ) { |
|
6716
|
|
|
|
|
|
|
case 2: |
|
6717
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
|
6718
|
|
|
|
|
|
|
break; |
|
6719
|
|
|
|
|
|
|
case 5: |
|
6720
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
|
6721
|
|
|
|
|
|
|
break; |
|
6722
|
|
|
|
|
|
|
case 15: |
|
6723
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
|
6724
|
|
|
|
|
|
|
break; |
|
6725
|
|
|
|
|
|
|
} |
|
6726
|
|
|
|
|
|
|
} |
|
6727
|
|
|
|
|
|
|
} |
|
6728
|
|
|
|
|
|
|
|
|
6729
|
|
|
|
|
|
|
_out: {} |
|
6730
|
|
|
|
|
|
|
} |
|
6731
|
|
|
|
|
|
|
|
|
6732
|
0
|
0
|
|
|
|
|
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
|
0
|
|
|
|
|
|
|
6733
|
0
|
|
|
|
|
|
} |
|
6734
|
|
|
|
|
|
|
|
|
6735
|
|
|
|
|
|
|
static const char _VBD_VBN_actions[] = { |
|
6736
|
|
|
|
|
|
|
0, 1, 0, 1, 2, 1, 3, 1, |
|
6737
|
|
|
|
|
|
|
4, 1, 5, 1, 6, 1, 7, 1, |
|
6738
|
|
|
|
|
|
|
8, 1, 9, 1, 10, 1, 11, 1, |
|
6739
|
|
|
|
|
|
|
13, 1, 14, 1, 15, 1, 16, 1, |
|
6740
|
|
|
|
|
|
|
17, 2, 1, 16, 2, 4, 5, 2, |
|
6741
|
|
|
|
|
|
|
8, 16, 2, 9, 13, 2, 9, 14, |
|
6742
|
|
|
|
|
|
|
2, 12, 13, 2, 13, 14, 2, 15, |
|
6743
|
|
|
|
|
|
|
16, 3, 1, 3, 16, 3, 3, 15, |
|
6744
|
|
|
|
|
|
|
16 |
|
6745
|
|
|
|
|
|
|
}; |
|
6746
|
|
|
|
|
|
|
|
|
6747
|
|
|
|
|
|
|
static const short _VBD_VBN_key_offsets[] = { |
|
6748
|
|
|
|
|
|
|
0, 0, 2, 3, 9, 14, 24, 29, |
|
6749
|
|
|
|
|
|
|
34, 44, 46, 47, 48, 49, 50, 51, |
|
6750
|
|
|
|
|
|
|
52, 60, 67, 74, 76, 77, 78, 79, |
|
6751
|
|
|
|
|
|
|
80, 81, 82, 87, 95, 96, 97, 98, |
|
6752
|
|
|
|
|
|
|
99, 100, 102, 103, 104, 105, 106, 107, |
|
6753
|
|
|
|
|
|
|
108, 114, 115, 140, 140, 149, 150, 155, |
|
6754
|
|
|
|
|
|
|
166, 175, 184, 194, 199, 204, 210, 220, |
|
6755
|
|
|
|
|
|
|
220, 229, 241, 242, 253, 253, 262, 271, |
|
6756
|
|
|
|
|
|
|
280, 289, 298, 303, 316, 327, 332, 338, |
|
6757
|
|
|
|
|
|
|
348, 358, 369, 376, 387, 396, 405, 405, |
|
6758
|
|
|
|
|
|
|
416, 427, 429, 430, 431, 431, 432, 440, |
|
6759
|
|
|
|
|
|
|
451, 456, 462, 472, 482, 493, 500, 511, |
|
6760
|
|
|
|
|
|
|
518, 524, 533, 542, 551 |
|
6761
|
|
|
|
|
|
|
}; |
|
6762
|
|
|
|
|
|
|
|
|
6763
|
|
|
|
|
|
|
static const char _VBD_VBN_trans_keys[] = { |
|
6764
|
|
|
|
|
|
|
100, 110, 101, 97, 101, 105, 111, 117, |
|
6765
|
|
|
|
|
|
|
121, 97, 101, 105, 111, 117, 98, 100, |
|
6766
|
|
|
|
|
|
|
102, 104, 106, 110, 112, 116, 118, 122, |
|
6767
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 97, 101, 105, |
|
6768
|
|
|
|
|
|
|
111, 117, 98, 100, 102, 104, 106, 110, |
|
6769
|
|
|
|
|
|
|
112, 116, 118, 122, 98, 114, 105, 114, |
|
6770
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
|
6771
|
|
|
|
|
|
|
117, 121, 98, 122, 97, 101, 105, 111, |
|
6772
|
|
|
|
|
|
|
117, 98, 122, 97, 101, 105, 111, 117, |
|
6773
|
|
|
|
|
|
|
98, 122, 98, 114, 105, 114, 112, 105, |
|
6774
|
|
|
|
|
|
|
109, 101, 97, 101, 105, 111, 117, 97, |
|
6775
|
|
|
|
|
|
|
101, 105, 110, 111, 115, 117, 120, 105, |
|
6776
|
|
|
|
|
|
|
112, 105, 109, 101, 98, 114, 105, 114, |
|
6777
|
|
|
|
|
|
|
112, 105, 109, 101, 97, 101, 105, 111, |
|
6778
|
|
|
|
|
|
|
117, 121, 101, 98, 99, 100, 102, 103, |
|
6779
|
|
|
|
|
|
|
104, 105, 106, 107, 108, 109, 110, 112, |
|
6780
|
|
|
|
|
|
|
113, 114, 115, 116, 117, 118, 119, 120, |
|
6781
|
|
|
|
|
|
|
121, 122, 97, 111, 97, 98, 101, 105, |
|
6782
|
|
|
|
|
|
|
111, 117, 122, 99, 120, 113, 97, 101, |
|
6783
|
|
|
|
|
|
|
105, 111, 117, 98, 99, 100, 105, 111, |
|
6784
|
|
|
|
|
|
|
117, 122, 97, 101, 102, 120, 97, 100, |
|
6785
|
|
|
|
|
|
|
101, 105, 111, 117, 122, 98, 120, 97, |
|
6786
|
|
|
|
|
|
|
101, 102, 105, 111, 117, 122, 98, 120, |
|
6787
|
|
|
|
|
|
|
97, 101, 103, 105, 110, 111, 117, 122, |
|
6788
|
|
|
|
|
|
|
98, 120, 97, 101, 105, 111, 117, 101, |
|
6789
|
|
|
|
|
|
|
110, 111, 115, 120, 101, 110, 111, 112, |
|
6790
|
|
|
|
|
|
|
115, 120, 97, 101, 104, 105, 111, 116, |
|
6791
|
|
|
|
|
|
|
117, 122, 98, 120, 97, 101, 105, 106, |
|
6792
|
|
|
|
|
|
|
111, 117, 122, 98, 120, 98, 99, 100, |
|
6793
|
|
|
|
|
|
|
105, 107, 111, 117, 122, 97, 101, 102, |
|
6794
|
|
|
|
|
|
|
120, 105, 97, 101, 105, 108, 111, 114, |
|
6795
|
|
|
|
|
|
|
117, 119, 122, 98, 120, 97, 101, 105, |
|
6796
|
|
|
|
|
|
|
109, 111, 117, 122, 98, 120, 97, 101, |
|
6797
|
|
|
|
|
|
|
105, 110, 111, 117, 122, 98, 120, 97, |
|
6798
|
|
|
|
|
|
|
101, 105, 111, 112, 117, 122, 98, 120, |
|
6799
|
|
|
|
|
|
|
97, 101, 105, 111, 113, 117, 122, 98, |
|
6800
|
|
|
|
|
|
|
120, 97, 101, 105, 111, 114, 117, 122, |
|
6801
|
|
|
|
|
|
|
98, 120, 97, 101, 105, 111, 117, 98, |
|
6802
|
|
|
|
|
|
|
99, 100, 105, 108, 110, 111, 116, 117, |
|
6803
|
|
|
|
|
|
|
97, 101, 102, 122, 101, 110, 111, 115, |
|
6804
|
|
|
|
|
|
|
120, 98, 104, 106, 116, 118, 122, 101, |
|
6805
|
|
|
|
|
|
|
110, 111, 115, 120, 101, 110, 111, 112, |
|
6806
|
|
|
|
|
|
|
115, 120, 101, 105, 110, 111, 115, 120, |
|
6807
|
|
|
|
|
|
|
98, 116, 118, 122, 101, 105, 110, 111, |
|
6808
|
|
|
|
|
|
|
115, 120, 98, 116, 118, 122, 101, 110, |
|
6809
|
|
|
|
|
|
|
111, 115, 120, 98, 104, 106, 116, 118, |
|
6810
|
|
|
|
|
|
|
122, 98, 101, 110, 111, 114, 115, 120, |
|
6811
|
|
|
|
|
|
|
101, 110, 111, 115, 120, 98, 104, 106, |
|
6812
|
|
|
|
|
|
|
116, 118, 122, 97, 101, 105, 111, 115, |
|
6813
|
|
|
|
|
|
|
117, 122, 98, 120, 97, 101, 105, 111, |
|
6814
|
|
|
|
|
|
|
116, 117, 122, 98, 120, 122, 98, 100, |
|
6815
|
|
|
|
|
|
|
102, 104, 106, 110, 112, 116, 118, 120, |
|
6816
|
|
|
|
|
|
|
122, 98, 100, 102, 104, 106, 110, 112, |
|
6817
|
|
|
|
|
|
|
116, 118, 120, 98, 114, 112, 114, 113, |
|
6818
|
|
|
|
|
|
|
97, 101, 105, 108, 111, 117, 98, 122, |
|
6819
|
|
|
|
|
|
|
101, 110, 111, 115, 120, 98, 104, 106, |
|
6820
|
|
|
|
|
|
|
116, 118, 122, 101, 110, 111, 115, 120, |
|
6821
|
|
|
|
|
|
|
101, 110, 111, 112, 115, 120, 101, 105, |
|
6822
|
|
|
|
|
|
|
110, 111, 115, 120, 98, 116, 118, 122, |
|
6823
|
|
|
|
|
|
|
101, 105, 110, 111, 115, 120, 98, 116, |
|
6824
|
|
|
|
|
|
|
118, 122, 101, 110, 111, 115, 120, 98, |
|
6825
|
|
|
|
|
|
|
104, 106, 116, 118, 122, 98, 101, 110, |
|
6826
|
|
|
|
|
|
|
111, 114, 115, 120, 101, 110, 111, 115, |
|
6827
|
|
|
|
|
|
|
120, 98, 104, 106, 116, 118, 122, 97, |
|
6828
|
|
|
|
|
|
|
101, 105, 111, 117, 98, 122, 97, 101, |
|
6829
|
|
|
|
|
|
|
105, 111, 117, 121, 97, 101, 105, 111, |
|
6830
|
|
|
|
|
|
|
117, 118, 122, 98, 120, 97, 101, 105, |
|
6831
|
|
|
|
|
|
|
111, 117, 119, 122, 98, 120, 97, 101, |
|
6832
|
|
|
|
|
|
|
105, 111, 117, 120, 122, 98, 119, 97, |
|
6833
|
|
|
|
|
|
|
101, 105, 111, 117, 121, 122, 98, 120, |
|
6834
|
|
|
|
|
|
|
0 |
|
6835
|
|
|
|
|
|
|
}; |
|
6836
|
|
|
|
|
|
|
|
|
6837
|
|
|
|
|
|
|
static const char _VBD_VBN_single_lengths[] = { |
|
6838
|
|
|
|
|
|
|
0, 2, 1, 6, 5, 0, 5, 5, |
|
6839
|
|
|
|
|
|
|
0, 2, 1, 1, 1, 1, 1, 1, |
|
6840
|
|
|
|
|
|
|
6, 5, 5, 2, 1, 1, 1, 1, |
|
6841
|
|
|
|
|
|
|
1, 1, 5, 8, 1, 1, 1, 1, |
|
6842
|
|
|
|
|
|
|
1, 2, 1, 1, 1, 1, 1, 1, |
|
6843
|
|
|
|
|
|
|
6, 1, 23, 0, 7, 1, 5, 7, |
|
6844
|
|
|
|
|
|
|
7, 7, 8, 5, 5, 6, 8, 0, |
|
6845
|
|
|
|
|
|
|
7, 8, 1, 9, 0, 7, 7, 7, |
|
6846
|
|
|
|
|
|
|
7, 7, 5, 9, 5, 5, 6, 6, |
|
6847
|
|
|
|
|
|
|
6, 5, 7, 5, 7, 7, 0, 1, |
|
6848
|
|
|
|
|
|
|
1, 2, 1, 1, 0, 1, 6, 5, |
|
6849
|
|
|
|
|
|
|
5, 6, 6, 6, 5, 7, 5, 5, |
|
6850
|
|
|
|
|
|
|
6, 7, 7, 7, 7 |
|
6851
|
|
|
|
|
|
|
}; |
|
6852
|
|
|
|
|
|
|
|
|
6853
|
|
|
|
|
|
|
static const char _VBD_VBN_range_lengths[] = { |
|
6854
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 5, 0, 0, |
|
6855
|
|
|
|
|
|
|
5, 0, 0, 0, 0, 0, 0, 0, |
|
6856
|
|
|
|
|
|
|
1, 1, 1, 0, 0, 0, 0, 0, |
|
6857
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6858
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6859
|
|
|
|
|
|
|
0, 0, 1, 0, 1, 0, 0, 2, |
|
6860
|
|
|
|
|
|
|
1, 1, 1, 0, 0, 0, 1, 0, |
|
6861
|
|
|
|
|
|
|
1, 2, 0, 1, 0, 1, 1, 1, |
|
6862
|
|
|
|
|
|
|
1, 1, 0, 2, 3, 0, 0, 2, |
|
6863
|
|
|
|
|
|
|
2, 3, 0, 3, 1, 1, 0, 5, |
|
6864
|
|
|
|
|
|
|
5, 0, 0, 0, 0, 0, 1, 3, |
|
6865
|
|
|
|
|
|
|
0, 0, 2, 2, 3, 0, 3, 1, |
|
6866
|
|
|
|
|
|
|
0, 1, 1, 1, 1 |
|
6867
|
|
|
|
|
|
|
}; |
|
6868
|
|
|
|
|
|
|
|
|
6869
|
|
|
|
|
|
|
static const short _VBD_VBN_index_offsets[] = { |
|
6870
|
|
|
|
|
|
|
0, 0, 3, 5, 12, 18, 24, 30, |
|
6871
|
|
|
|
|
|
|
36, 42, 45, 47, 49, 51, 53, 55, |
|
6872
|
|
|
|
|
|
|
57, 65, 72, 79, 82, 84, 86, 88, |
|
6873
|
|
|
|
|
|
|
90, 92, 94, 100, 109, 111, 113, 115, |
|
6874
|
|
|
|
|
|
|
117, 119, 122, 124, 126, 128, 130, 132, |
|
6875
|
|
|
|
|
|
|
134, 141, 143, 168, 169, 178, 180, 186, |
|
6876
|
|
|
|
|
|
|
196, 205, 214, 224, 230, 236, 243, 253, |
|
6877
|
|
|
|
|
|
|
254, 263, 274, 276, 287, 288, 297, 306, |
|
6878
|
|
|
|
|
|
|
315, 324, 333, 339, 351, 360, 366, 373, |
|
6879
|
|
|
|
|
|
|
382, 391, 400, 408, 417, 426, 435, 436, |
|
6880
|
|
|
|
|
|
|
443, 450, 453, 455, 457, 458, 460, 468, |
|
6881
|
|
|
|
|
|
|
477, 483, 490, 499, 508, 517, 525, 534, |
|
6882
|
|
|
|
|
|
|
541, 548, 557, 566, 575 |
|
6883
|
|
|
|
|
|
|
}; |
|
6884
|
|
|
|
|
|
|
|
|
6885
|
|
|
|
|
|
|
static const unsigned char _VBD_VBN_indicies[] = { |
|
6886
|
|
|
|
|
|
|
0, 2, 1, 3, 1, 4, 4, 4, |
|
6887
|
|
|
|
|
|
|
4, 4, 4, 1, 5, 5, 5, 5, |
|
6888
|
|
|
|
|
|
|
6, 1, 7, 7, 7, 7, 7, 1, |
|
6889
|
|
|
|
|
|
|
8, 8, 8, 8, 9, 1, 5, 5, |
|
6890
|
|
|
|
|
|
|
5, 5, 10, 1, 11, 11, 11, 11, |
|
6891
|
|
|
|
|
|
|
11, 1, 11, 12, 1, 11, 1, 13, |
|
6892
|
|
|
|
|
|
|
1, 11, 1, 14, 1, 11, 1, 11, |
|
6893
|
|
|
|
|
|
|
1, 4, 4, 4, 4, 4, 16, 15, |
|
6894
|
|
|
|
|
|
|
1, 5, 5, 5, 5, 6, 17, 1, |
|
6895
|
|
|
|
|
|
|
5, 5, 5, 5, 6, 18, 1, 19, |
|
6896
|
|
|
|
|
|
|
20, 1, 19, 1, 21, 1, 19, 1, |
|
6897
|
|
|
|
|
|
|
22, 1, 19, 1, 19, 1, 23, 24, |
|
6898
|
|
|
|
|
|
|
23, 25, 26, 1, 27, 28, 27, 29, |
|
6899
|
|
|
|
|
|
|
30, 31, 27, 32, 1, 33, 1, 33, |
|
6900
|
|
|
|
|
|
|
1, 34, 1, 33, 1, 33, 1, 35, |
|
6901
|
|
|
|
|
|
|
36, 1, 35, 1, 37, 1, 35, 1, |
|
6902
|
|
|
|
|
|
|
38, 1, 35, 1, 35, 1, 39, 39, |
|
6903
|
|
|
|
|
|
|
39, 39, 39, 4, 1, 40, 1, 42, |
|
6904
|
|
|
|
|
|
|
43, 44, 45, 46, 47, 48, 49, 50, |
|
6905
|
|
|
|
|
|
|
51, 52, 53, 54, 55, 56, 57, 58, |
|
6906
|
|
|
|
|
|
|
59, 60, 61, 62, 63, 64, 41, 1, |
|
6907
|
|
|
|
|
|
|
1, 65, 66, 65, 65, 65, 65, 4, |
|
6908
|
|
|
|
|
|
|
4, 1, 67, 1, 68, 68, 68, 68, |
|
6909
|
|
|
|
|
|
|
68, 1, 70, 71, 70, 69, 69, 69, |
|
6910
|
|
|
|
|
|
|
70, 69, 70, 1, 72, 66, 72, 72, |
|
6911
|
|
|
|
|
|
|
72, 72, 4, 4, 1, 65, 65, 66, |
|
6912
|
|
|
|
|
|
|
65, 65, 65, 4, 4, 1, 69, 69, |
|
6913
|
|
|
|
|
|
|
71, 69, 73, 69, 69, 70, 70, 1, |
|
6914
|
|
|
|
|
|
|
74, 74, 74, 74, 74, 1, 75, 76, |
|
6915
|
|
|
|
|
|
|
77, 78, 79, 1, 75, 76, 77, 11, |
|
6916
|
|
|
|
|
|
|
78, 79, 1, 65, 65, 66, 65, 65, |
|
6917
|
|
|
|
|
|
|
80, 65, 4, 4, 1, 81, 65, 65, |
|
6918
|
|
|
|
|
|
|
65, 66, 65, 65, 4, 4, 1, 4, |
|
6919
|
|
|
|
|
|
|
82, 4, 65, 66, 65, 65, 4, 65, |
|
6920
|
|
|
|
|
|
|
4, 1, 7, 1, 65, 65, 65, 71, |
|
6921
|
|
|
|
|
|
|
65, 83, 65, 83, 70, 70, 1, 5, |
|
6922
|
|
|
|
|
|
|
65, 65, 65, 66, 65, 65, 4, 4, |
|
6923
|
|
|
|
|
|
|
1, 84, 84, 85, 66, 84, 84, 4, |
|
6924
|
|
|
|
|
|
|
4, 1, 84, 84, 84, 84, 66, 84, |
|
6925
|
|
|
|
|
|
|
4, 4, 1, 65, 65, 65, 65, 66, |
|
6926
|
|
|
|
|
|
|
65, 4, 4, 1, 65, 86, 65, 87, |
|
6927
|
|
|
|
|
|
|
66, 65, 4, 4, 1, 5, 5, 5, |
|
6928
|
|
|
|
|
|
|
5, 6, 1, 88, 89, 88, 5, 89, |
|
6929
|
|
|
|
|
|
|
89, 5, 89, 6, 5, 88, 1, 90, |
|
6930
|
|
|
|
|
|
|
91, 92, 93, 94, 88, 88, 88, 1, |
|
6931
|
|
|
|
|
|
|
90, 95, 92, 96, 97, 1, 90, 95, |
|
6932
|
|
|
|
|
|
|
92, 19, 96, 97, 1, 90, 19, 91, |
|
6933
|
|
|
|
|
|
|
92, 93, 94, 88, 88, 1, 90, 22, |
|
6934
|
|
|
|
|
|
|
91, 92, 93, 94, 88, 88, 1, 98, |
|
6935
|
|
|
|
|
|
|
91, 92, 93, 94, 88, 88, 88, 1, |
|
6936
|
|
|
|
|
|
|
19, 90, 95, 92, 20, 96, 97, 1, |
|
6937
|
|
|
|
|
|
|
90, 100, 92, 101, 102, 99, 99, 99, |
|
6938
|
|
|
|
|
|
|
1, 69, 69, 69, 69, 103, 69, 70, |
|
6939
|
|
|
|
|
|
|
70, 1, 104, 105, 106, 65, 66, 65, |
|
6940
|
|
|
|
|
|
|
4, 4, 1, 107, 109, 109, 109, 109, |
|
6941
|
|
|
|
|
|
|
109, 109, 108, 110, 110, 110, 110, 110, |
|
6942
|
|
|
|
|
|
|
110, 1, 33, 111, 1, 33, 1, 112, |
|
6943
|
|
|
|
|
|
|
1, 108, 113, 107, 5, 5, 5, 115, |
|
6944
|
|
|
|
|
|
|
5, 6, 114, 1, 116, 117, 118, 119, |
|
6945
|
|
|
|
|
|
|
120, 114, 114, 114, 1, 116, 121, 118, |
|
6946
|
|
|
|
|
|
|
122, 123, 1, 116, 121, 118, 35, 122, |
|
6947
|
|
|
|
|
|
|
123, 1, 116, 35, 117, 118, 119, 120, |
|
6948
|
|
|
|
|
|
|
114, 114, 1, 116, 38, 117, 118, 119, |
|
6949
|
|
|
|
|
|
|
120, 114, 114, 1, 124, 117, 118, 119, |
|
6950
|
|
|
|
|
|
|
120, 114, 114, 114, 1, 35, 116, 121, |
|
6951
|
|
|
|
|
|
|
118, 36, 122, 123, 1, 116, 126, 118, |
|
6952
|
|
|
|
|
|
|
127, 128, 125, 125, 125, 1, 5, 5, |
|
6953
|
|
|
|
|
|
|
5, 5, 6, 114, 1, 4, 4, 4, |
|
6954
|
|
|
|
|
|
|
4, 4, 4, 1, 69, 69, 69, 69, |
|
6955
|
|
|
|
|
|
|
69, 71, 70, 70, 1, 84, 84, 84, |
|
6956
|
|
|
|
|
|
|
84, 84, 66, 4, 4, 1, 84, 84, |
|
6957
|
|
|
|
|
|
|
84, 84, 84, 66, 4, 4, 1, 129, |
|
6958
|
|
|
|
|
|
|
129, 129, 129, 129, 131, 132, 130, 1, |
|
6959
|
|
|
|
|
|
|
0 |
|
6960
|
|
|
|
|
|
|
}; |
|
6961
|
|
|
|
|
|
|
|
|
6962
|
|
|
|
|
|
|
static const char _VBD_VBN_trans_targs[] = { |
|
6963
|
|
|
|
|
|
|
2, 0, 41, 42, 43, 43, 45, 43, |
|
6964
|
|
|
|
|
|
|
43, 45, 45, 52, 53, 12, 14, 43, |
|
6965
|
|
|
|
|
|
|
43, 43, 43, 69, 70, 22, 24, 78, |
|
6966
|
|
|
|
|
|
|
79, 84, 85, 43, 81, 28, 83, 30, |
|
6967
|
|
|
|
|
|
|
32, 43, 31, 88, 89, 36, 38, 66, |
|
6968
|
|
|
|
|
|
|
43, 3, 44, 47, 48, 49, 50, 54, |
|
6969
|
|
|
|
|
|
|
16, 56, 57, 59, 61, 62, 63, 64, |
|
6970
|
|
|
|
|
|
|
65, 76, 77, 96, 97, 98, 99, 40, |
|
6971
|
|
|
|
|
|
|
100, 4, 46, 43, 5, 6, 43, 46, |
|
6972
|
|
|
|
|
|
|
7, 51, 8, 9, 10, 11, 13, 15, |
|
6973
|
|
|
|
|
|
|
55, 43, 58, 60, 17, 18, 66, 67, |
|
6974
|
|
|
|
|
|
|
68, 75, 19, 71, 21, 72, 73, 20, |
|
6975
|
|
|
|
|
|
|
23, 25, 74, 68, 71, 72, 73, 46, |
|
6976
|
|
|
|
|
|
|
26, 86, 95, 43, 43, 80, 27, 82, |
|
6977
|
|
|
|
|
|
|
29, 43, 87, 94, 33, 90, 35, 91, |
|
6978
|
|
|
|
|
|
|
92, 34, 37, 39, 93, 87, 90, 91, |
|
6979
|
|
|
|
|
|
|
92, 66, 43, 43, 46 |
|
6980
|
|
|
|
|
|
|
}; |
|
6981
|
|
|
|
|
|
|
|
|
6982
|
|
|
|
|
|
|
static const char _VBD_VBN_trans_actions[] = { |
|
6983
|
|
|
|
|
|
|
0, 0, 0, 31, 29, 25, 25, 5, |
|
6984
|
|
|
|
|
|
|
51, 51, 45, 0, 0, 0, 0, 15, |
|
6985
|
|
|
|
|
|
|
39, 9, 36, 0, 0, 0, 0, 25, |
|
6986
|
|
|
|
|
|
|
25, 25, 25, 21, 21, 0, 21, 0, |
|
6987
|
|
|
|
|
|
|
0, 19, 0, 0, 0, 0, 0, 29, |
|
6988
|
|
|
|
|
|
|
1, 0, 0, 0, 0, 0, 0, 0, |
|
6989
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6990
|
|
|
|
|
|
|
0, 0, 0, 27, 0, 0, 0, 0, |
|
6991
|
|
|
|
|
|
|
0, 0, 29, 17, 0, 0, 54, 54, |
|
6992
|
|
|
|
|
|
|
0, 54, 0, 0, 0, 0, 0, 0, |
|
6993
|
|
|
|
|
|
|
29, 27, 29, 54, 0, 0, 13, 13, |
|
6994
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
6995
|
|
|
|
|
|
|
0, 0, 0, 7, 7, 7, 7, 61, |
|
6996
|
|
|
|
|
|
|
0, 19, 19, 23, 48, 48, 0, 19, |
|
6997
|
|
|
|
|
|
|
0, 42, 0, 0, 0, 0, 0, 0, |
|
6998
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 17, 17, 17, |
|
6999
|
|
|
|
|
|
|
17, 3, 33, 3, 57 |
|
7000
|
|
|
|
|
|
|
}; |
|
7001
|
|
|
|
|
|
|
|
|
7002
|
|
|
|
|
|
|
static const char _VBD_VBN_eof_actions[] = { |
|
7003
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7004
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7005
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7006
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7007
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7008
|
|
|
|
|
|
|
0, 0, 0, 0, 5, 0, 0, 5, |
|
7009
|
|
|
|
|
|
|
5, 5, 5, 0, 5, 5, 5, 0, |
|
7010
|
|
|
|
|
|
|
5, 5, 0, 5, 0, 5, 5, 5, |
|
7011
|
|
|
|
|
|
|
5, 5, 0, 0, 11, 11, 11, 11, |
|
7012
|
|
|
|
|
|
|
11, 11, 11, 11, 5, 5, 0, 0, |
|
7013
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 17, |
|
7014
|
|
|
|
|
|
|
17, 17, 17, 17, 17, 17, 17, 0, |
|
7015
|
|
|
|
|
|
|
0, 5, 5, 5, 5 |
|
7016
|
|
|
|
|
|
|
}; |
|
7017
|
|
|
|
|
|
|
|
|
7018
|
|
|
|
|
|
|
static const int VBD_VBN_start = 1; |
|
7019
|
|
|
|
|
|
|
|
|
7020
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_VBD_VBN(const string& form, vector& lemmas) const { |
|
7021
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
|
7022
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
|
7023
|
|
|
|
|
|
|
|
|
7024
|
|
|
|
|
|
|
{ |
|
7025
|
|
|
|
|
|
|
cs = VBD_VBN_start; |
|
7026
|
|
|
|
|
|
|
} |
|
7027
|
|
|
|
|
|
|
|
|
7028
|
|
|
|
|
|
|
{ |
|
7029
|
|
|
|
|
|
|
int _klen; |
|
7030
|
|
|
|
|
|
|
unsigned int _trans; |
|
7031
|
|
|
|
|
|
|
const char *_acts; |
|
7032
|
|
|
|
|
|
|
unsigned int _nacts; |
|
7033
|
|
|
|
|
|
|
const char *_keys; |
|
7034
|
|
|
|
|
|
|
|
|
7035
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
7036
|
|
|
|
|
|
|
goto _test_eof; |
|
7037
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
7038
|
|
|
|
|
|
|
goto _out; |
|
7039
|
|
|
|
|
|
|
_resume: |
|
7040
|
0
|
|
|
|
|
|
_keys = _VBD_VBN_trans_keys + _VBD_VBN_key_offsets[cs]; |
|
7041
|
0
|
|
|
|
|
|
_trans = _VBD_VBN_index_offsets[cs]; |
|
7042
|
|
|
|
|
|
|
|
|
7043
|
0
|
|
|
|
|
|
_klen = _VBD_VBN_single_lengths[cs]; |
|
7044
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7045
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7046
|
|
|
|
|
|
|
const char *_mid; |
|
7047
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
|
7048
|
|
|
|
|
|
|
while (1) { |
|
7049
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7050
|
|
|
|
|
|
|
break; |
|
7051
|
|
|
|
|
|
|
|
|
7052
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
7053
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
7054
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
7055
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
7056
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
7057
|
|
|
|
|
|
|
else { |
|
7058
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
7059
|
0
|
|
|
|
|
|
goto _match; |
|
7060
|
|
|
|
|
|
|
} |
|
7061
|
|
|
|
|
|
|
} |
|
7062
|
0
|
|
|
|
|
|
_keys += _klen; |
|
7063
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7064
|
|
|
|
|
|
|
} |
|
7065
|
|
|
|
|
|
|
|
|
7066
|
0
|
|
|
|
|
|
_klen = _VBD_VBN_range_lengths[cs]; |
|
7067
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7068
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7069
|
|
|
|
|
|
|
const char *_mid; |
|
7070
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
|
7071
|
|
|
|
|
|
|
while (1) { |
|
7072
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7073
|
|
|
|
|
|
|
break; |
|
7074
|
|
|
|
|
|
|
|
|
7075
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
7076
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
7077
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
7078
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
7079
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
7080
|
|
|
|
|
|
|
else { |
|
7081
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
7082
|
0
|
|
|
|
|
|
goto _match; |
|
7083
|
|
|
|
|
|
|
} |
|
7084
|
|
|
|
|
|
|
} |
|
7085
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7086
|
|
|
|
|
|
|
} |
|
7087
|
|
|
|
|
|
|
|
|
7088
|
|
|
|
|
|
|
_match: |
|
7089
|
0
|
|
|
|
|
|
_trans = _VBD_VBN_indicies[_trans]; |
|
7090
|
0
|
|
|
|
|
|
cs = _VBD_VBN_trans_targs[_trans]; |
|
7091
|
|
|
|
|
|
|
|
|
7092
|
0
|
0
|
|
|
|
|
if ( _VBD_VBN_trans_actions[_trans] == 0 ) |
|
7093
|
|
|
|
|
|
|
goto _again; |
|
7094
|
|
|
|
|
|
|
|
|
7095
|
0
|
|
|
|
|
|
_acts = _VBD_VBN_actions + _VBD_VBN_trans_actions[_trans]; |
|
7096
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
|
7097
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
|
7098
|
|
|
|
|
|
|
{ |
|
7099
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
|
7100
|
|
|
|
|
|
|
{ |
|
7101
|
|
|
|
|
|
|
case 0: |
|
7102
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
|
7103
|
|
|
|
|
|
|
break; |
|
7104
|
|
|
|
|
|
|
case 1: |
|
7105
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
|
7106
|
|
|
|
|
|
|
break; |
|
7107
|
|
|
|
|
|
|
case 2: |
|
7108
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
|
7109
|
|
|
|
|
|
|
break; |
|
7110
|
|
|
|
|
|
|
case 3: |
|
7111
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7112
|
|
|
|
|
|
|
break; |
|
7113
|
|
|
|
|
|
|
case 4: |
|
7114
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
7115
|
|
|
|
|
|
|
break; |
|
7116
|
|
|
|
|
|
|
case 5: |
|
7117
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
7118
|
|
|
|
|
|
|
break; |
|
7119
|
|
|
|
|
|
|
case 7: |
|
7120
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
7121
|
|
|
|
|
|
|
break; |
|
7122
|
|
|
|
|
|
|
case 8: |
|
7123
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 3, append = "y"; } |
|
7124
|
|
|
|
|
|
|
break; |
|
7125
|
|
|
|
|
|
|
case 9: |
|
7126
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
|
7127
|
|
|
|
|
|
|
break; |
|
7128
|
|
|
|
|
|
|
case 10: |
|
7129
|
0
|
0
|
|
|
|
|
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
|
7130
|
|
|
|
|
|
|
break; |
|
7131
|
|
|
|
|
|
|
case 11: |
|
7132
|
0
|
0
|
|
|
|
|
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
|
7133
|
|
|
|
|
|
|
break; |
|
7134
|
|
|
|
|
|
|
case 12: |
|
7135
|
0
|
0
|
|
|
|
|
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
|
7136
|
|
|
|
|
|
|
break; |
|
7137
|
|
|
|
|
|
|
case 13: |
|
7138
|
0
|
0
|
|
|
|
|
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
|
7139
|
|
|
|
|
|
|
break; |
|
7140
|
|
|
|
|
|
|
case 14: |
|
7141
|
0
|
0
|
|
|
|
|
{ if (best > 'o') best = 'o', remove = 2, append = nullptr; } |
|
7142
|
|
|
|
|
|
|
break; |
|
7143
|
|
|
|
|
|
|
case 15: |
|
7144
|
0
|
0
|
|
|
|
|
{ if (best > 'p') best = 'p', remove = 1, append = nullptr; } |
|
7145
|
|
|
|
|
|
|
break; |
|
7146
|
|
|
|
|
|
|
case 16: |
|
7147
|
0
|
0
|
|
|
|
|
{ if (best > 'q') best = 'q', remove = 2, append = nullptr; } |
|
7148
|
|
|
|
|
|
|
break; |
|
7149
|
|
|
|
|
|
|
case 17: |
|
7150
|
0
|
0
|
|
|
|
|
{ if (best > 'r') best = 'r', remove = 1, append = nullptr; } |
|
7151
|
|
|
|
|
|
|
break; |
|
7152
|
|
|
|
|
|
|
} |
|
7153
|
|
|
|
|
|
|
} |
|
7154
|
|
|
|
|
|
|
|
|
7155
|
|
|
|
|
|
|
_again: |
|
7156
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
7157
|
|
|
|
|
|
|
goto _out; |
|
7158
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7159
|
|
|
|
|
|
|
goto _resume; |
|
7160
|
|
|
|
|
|
|
_test_eof: {} |
|
7161
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
7162
|
|
|
|
|
|
|
{ |
|
7163
|
0
|
|
|
|
|
|
const char *__acts = _VBD_VBN_actions + _VBD_VBN_eof_actions[cs]; |
|
7164
|
0
|
|
|
|
|
|
unsigned int __nacts = (unsigned int) *__acts++; |
|
7165
|
0
|
0
|
|
|
|
|
while ( __nacts-- > 0 ) { |
|
7166
|
0
|
|
|
|
|
|
switch ( *__acts++ ) { |
|
7167
|
|
|
|
|
|
|
case 3: |
|
7168
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7169
|
|
|
|
|
|
|
break; |
|
7170
|
|
|
|
|
|
|
case 6: |
|
7171
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
|
7172
|
|
|
|
|
|
|
break; |
|
7173
|
|
|
|
|
|
|
case 9: |
|
7174
|
0
|
0
|
|
|
|
|
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
|
7175
|
|
|
|
|
|
|
break; |
|
7176
|
|
|
|
|
|
|
} |
|
7177
|
|
|
|
|
|
|
} |
|
7178
|
|
|
|
|
|
|
} |
|
7179
|
|
|
|
|
|
|
|
|
7180
|
|
|
|
|
|
|
_out: {} |
|
7181
|
|
|
|
|
|
|
} |
|
7182
|
|
|
|
|
|
|
|
|
7183
|
0
|
0
|
|
|
|
|
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
|
0
|
|
|
|
|
|
|
7184
|
0
|
|
|
|
|
|
} |
|
7185
|
|
|
|
|
|
|
|
|
7186
|
|
|
|
|
|
|
static const char _VBZ_actions[] = { |
|
7187
|
|
|
|
|
|
|
0, 1, 0, 1, 1, 1, 2, 1, |
|
7188
|
|
|
|
|
|
|
3, 1, 4, 1, 5, 1, 6, 1, |
|
7189
|
|
|
|
|
|
|
7, 1, 8 |
|
7190
|
|
|
|
|
|
|
}; |
|
7191
|
|
|
|
|
|
|
|
|
7192
|
|
|
|
|
|
|
static const char _VBZ_key_offsets[] = { |
|
7193
|
|
|
|
|
|
|
0, 0, 1, 2, 4, 14, 14, 25, |
|
7194
|
|
|
|
|
|
|
26, 31, 31, 31, 31, 37, 45, 54 |
|
7195
|
|
|
|
|
|
|
}; |
|
7196
|
|
|
|
|
|
|
|
|
7197
|
|
|
|
|
|
|
static const char _VBZ_trans_keys[] = { |
|
7198
|
|
|
|
|
|
|
115, 101, 99, 115, 98, 100, 102, 104, |
|
7199
|
|
|
|
|
|
|
106, 110, 112, 116, 118, 122, 122, 98, |
|
7200
|
|
|
|
|
|
|
100, 102, 104, 106, 110, 112, 116, 118, |
|
7201
|
|
|
|
|
|
|
120, 111, 97, 101, 105, 111, 117, 104, |
|
7202
|
|
|
|
|
|
|
105, 111, 115, 120, 122, 97, 101, 105, |
|
7203
|
|
|
|
|
|
|
110, 111, 114, 115, 117, 97, 101, 105, |
|
7204
|
|
|
|
|
|
|
111, 117, 121, 122, 98, 120, 0 |
|
7205
|
|
|
|
|
|
|
}; |
|
7206
|
|
|
|
|
|
|
|
|
7207
|
|
|
|
|
|
|
static const char _VBZ_single_lengths[] = { |
|
7208
|
|
|
|
|
|
|
0, 1, 1, 2, 0, 0, 1, 1, |
|
7209
|
|
|
|
|
|
|
5, 0, 0, 0, 6, 8, 7, 0 |
|
7210
|
|
|
|
|
|
|
}; |
|
7211
|
|
|
|
|
|
|
|
|
7212
|
|
|
|
|
|
|
static const char _VBZ_range_lengths[] = { |
|
7213
|
|
|
|
|
|
|
0, 0, 0, 0, 5, 0, 5, 0, |
|
7214
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 1, 0 |
|
7215
|
|
|
|
|
|
|
}; |
|
7216
|
|
|
|
|
|
|
|
|
7217
|
|
|
|
|
|
|
static const char _VBZ_index_offsets[] = { |
|
7218
|
|
|
|
|
|
|
0, 0, 2, 4, 7, 13, 14, 21, |
|
7219
|
|
|
|
|
|
|
23, 29, 30, 31, 32, 39, 48, 57 |
|
7220
|
|
|
|
|
|
|
}; |
|
7221
|
|
|
|
|
|
|
|
|
7222
|
|
|
|
|
|
|
static const char _VBZ_indicies[] = { |
|
7223
|
|
|
|
|
|
|
0, 1, 3, 2, 4, 4, 1, 5, |
|
7224
|
|
|
|
|
|
|
5, 5, 5, 5, 1, 6, 7, 7, |
|
7225
|
|
|
|
|
|
|
7, 7, 7, 7, 1, 8, 1, 9, |
|
7226
|
|
|
|
|
|
|
9, 9, 9, 9, 1, 8, 10, 1, |
|
7227
|
|
|
|
|
|
|
11, 12, 13, 14, 4, 15, 1, 16, |
|
7228
|
|
|
|
|
|
|
16, 16, 17, 16, 18, 19, 16, 1, |
|
7229
|
|
|
|
|
|
|
20, 20, 20, 20, 20, 20, 22, 21, |
|
7230
|
|
|
|
|
|
|
1, 10, 0 |
|
7231
|
|
|
|
|
|
|
}; |
|
7232
|
|
|
|
|
|
|
|
|
7233
|
|
|
|
|
|
|
static const char _VBZ_trans_targs[] = { |
|
7234
|
|
|
|
|
|
|
2, 0, 11, 12, 11, 5, 11, 11, |
|
7235
|
|
|
|
|
|
|
11, 9, 11, 3, 4, 6, 13, 14, |
|
7236
|
|
|
|
|
|
|
11, 7, 8, 11, 11, 10, 15 |
|
7237
|
|
|
|
|
|
|
}; |
|
7238
|
|
|
|
|
|
|
|
|
7239
|
|
|
|
|
|
|
static const char _VBZ_trans_actions[] = { |
|
7240
|
|
|
|
|
|
|
0, 0, 17, 17, 11, 0, 13, 15, |
|
7241
|
|
|
|
|
|
|
9, 0, 3, 0, 0, 0, 11, 11, |
|
7242
|
|
|
|
|
|
|
1, 0, 0, 7, 5, 0, 7 |
|
7243
|
|
|
|
|
|
|
}; |
|
7244
|
|
|
|
|
|
|
|
|
7245
|
|
|
|
|
|
|
static const int VBZ_start = 1; |
|
7246
|
|
|
|
|
|
|
|
|
7247
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_VBZ(const string& form, vector& lemmas) const { |
|
7248
|
|
|
|
|
|
|
const char* p = form.c_str(); int cs; |
|
7249
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
|
7250
|
|
|
|
|
|
|
|
|
7251
|
|
|
|
|
|
|
{ |
|
7252
|
|
|
|
|
|
|
cs = VBZ_start; |
|
7253
|
|
|
|
|
|
|
} |
|
7254
|
|
|
|
|
|
|
|
|
7255
|
|
|
|
|
|
|
{ |
|
7256
|
|
|
|
|
|
|
int _klen; |
|
7257
|
|
|
|
|
|
|
unsigned int _trans; |
|
7258
|
|
|
|
|
|
|
const char *_acts; |
|
7259
|
|
|
|
|
|
|
unsigned int _nacts; |
|
7260
|
|
|
|
|
|
|
const char *_keys; |
|
7261
|
|
|
|
|
|
|
|
|
7262
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
7263
|
|
|
|
|
|
|
goto _test_eof; |
|
7264
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
7265
|
|
|
|
|
|
|
goto _out; |
|
7266
|
|
|
|
|
|
|
_resume: |
|
7267
|
0
|
|
|
|
|
|
_keys = _VBZ_trans_keys + _VBZ_key_offsets[cs]; |
|
7268
|
0
|
|
|
|
|
|
_trans = _VBZ_index_offsets[cs]; |
|
7269
|
|
|
|
|
|
|
|
|
7270
|
0
|
|
|
|
|
|
_klen = _VBZ_single_lengths[cs]; |
|
7271
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7272
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7273
|
|
|
|
|
|
|
const char *_mid; |
|
7274
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
|
7275
|
|
|
|
|
|
|
while (1) { |
|
7276
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7277
|
|
|
|
|
|
|
break; |
|
7278
|
|
|
|
|
|
|
|
|
7279
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
7280
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
7281
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
7282
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
7283
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
7284
|
|
|
|
|
|
|
else { |
|
7285
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
7286
|
0
|
|
|
|
|
|
goto _match; |
|
7287
|
|
|
|
|
|
|
} |
|
7288
|
|
|
|
|
|
|
} |
|
7289
|
0
|
|
|
|
|
|
_keys += _klen; |
|
7290
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7291
|
|
|
|
|
|
|
} |
|
7292
|
|
|
|
|
|
|
|
|
7293
|
0
|
|
|
|
|
|
_klen = _VBZ_range_lengths[cs]; |
|
7294
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7295
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7296
|
|
|
|
|
|
|
const char *_mid; |
|
7297
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
|
7298
|
|
|
|
|
|
|
while (1) { |
|
7299
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7300
|
|
|
|
|
|
|
break; |
|
7301
|
|
|
|
|
|
|
|
|
7302
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
7303
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
7304
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
7305
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
7306
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
7307
|
|
|
|
|
|
|
else { |
|
7308
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
7309
|
0
|
|
|
|
|
|
goto _match; |
|
7310
|
|
|
|
|
|
|
} |
|
7311
|
|
|
|
|
|
|
} |
|
7312
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7313
|
|
|
|
|
|
|
} |
|
7314
|
|
|
|
|
|
|
|
|
7315
|
|
|
|
|
|
|
_match: |
|
7316
|
0
|
|
|
|
|
|
_trans = _VBZ_indicies[_trans]; |
|
7317
|
0
|
|
|
|
|
|
cs = _VBZ_trans_targs[_trans]; |
|
7318
|
|
|
|
|
|
|
|
|
7319
|
0
|
0
|
|
|
|
|
if ( _VBZ_trans_actions[_trans] == 0 ) |
|
7320
|
|
|
|
|
|
|
goto _again; |
|
7321
|
|
|
|
|
|
|
|
|
7322
|
0
|
|
|
|
|
|
_acts = _VBZ_actions + _VBZ_trans_actions[_trans]; |
|
7323
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
|
7324
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
|
7325
|
|
|
|
|
|
|
{ |
|
7326
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
|
7327
|
|
|
|
|
|
|
{ |
|
7328
|
|
|
|
|
|
|
case 0: |
|
7329
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
|
7330
|
|
|
|
|
|
|
break; |
|
7331
|
|
|
|
|
|
|
case 1: |
|
7332
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
|
7333
|
|
|
|
|
|
|
break; |
|
7334
|
|
|
|
|
|
|
case 2: |
|
7335
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
|
7336
|
|
|
|
|
|
|
break; |
|
7337
|
|
|
|
|
|
|
case 3: |
|
7338
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7339
|
|
|
|
|
|
|
break; |
|
7340
|
|
|
|
|
|
|
case 4: |
|
7341
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
7342
|
|
|
|
|
|
|
break; |
|
7343
|
|
|
|
|
|
|
case 5: |
|
7344
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
7345
|
|
|
|
|
|
|
break; |
|
7346
|
|
|
|
|
|
|
case 6: |
|
7347
|
0
|
0
|
|
|
|
|
{ if (best > 'g') best = 'g', remove = 3, append = "y"; } |
|
7348
|
|
|
|
|
|
|
break; |
|
7349
|
|
|
|
|
|
|
case 7: |
|
7350
|
0
|
0
|
|
|
|
|
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
7351
|
|
|
|
|
|
|
break; |
|
7352
|
|
|
|
|
|
|
case 8: |
|
7353
|
0
|
0
|
|
|
|
|
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
|
7354
|
|
|
|
|
|
|
break; |
|
7355
|
|
|
|
|
|
|
} |
|
7356
|
|
|
|
|
|
|
} |
|
7357
|
|
|
|
|
|
|
|
|
7358
|
|
|
|
|
|
|
_again: |
|
7359
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
7360
|
|
|
|
|
|
|
goto _out; |
|
7361
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7362
|
|
|
|
|
|
|
goto _resume; |
|
7363
|
|
|
|
|
|
|
_test_eof: {} |
|
7364
|
|
|
|
|
|
|
_out: {} |
|
7365
|
|
|
|
|
|
|
} |
|
7366
|
|
|
|
|
|
|
|
|
7367
|
0
|
0
|
|
|
|
|
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
|
0
|
|
|
|
|
|
|
7368
|
0
|
|
|
|
|
|
} |
|
7369
|
|
|
|
|
|
|
|
|
7370
|
|
|
|
|
|
|
static const char _JJR_RBR_actions[] = { |
|
7371
|
|
|
|
|
|
|
0, 1, 0, 1, 1, 1, 3, 1, |
|
7372
|
|
|
|
|
|
|
4, 1, 5, 2, 1, 4, 2, 2, |
|
7373
|
|
|
|
|
|
|
5, 2, 4, 5 |
|
7374
|
|
|
|
|
|
|
}; |
|
7375
|
|
|
|
|
|
|
|
|
7376
|
|
|
|
|
|
|
static const unsigned char _JJR_RBR_key_offsets[] = { |
|
7377
|
|
|
|
|
|
|
0, 0, 1, 2, 26, 26, 32, 37, |
|
7378
|
|
|
|
|
|
|
50, 56, 62, 73, 79, 85, 91, 102, |
|
7379
|
|
|
|
|
|
|
103, 109, 115, 117, 123, 129, 135, 146, |
|
7380
|
|
|
|
|
|
|
152, 163, 169, 175, 181 |
|
7381
|
|
|
|
|
|
|
}; |
|
7382
|
|
|
|
|
|
|
|
|
7383
|
|
|
|
|
|
|
static const char _JJR_RBR_trans_keys[] = { |
|
7384
|
|
|
|
|
|
|
114, 101, 98, 99, 100, 101, 102, 103, |
|
7385
|
|
|
|
|
|
|
104, 105, 106, 107, 108, 109, 110, 112, |
|
7386
|
|
|
|
|
|
|
113, 114, 115, 116, 117, 118, 119, 120, |
|
7387
|
|
|
|
|
|
|
121, 122, 97, 98, 101, 105, 111, 117, |
|
7388
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 98, 99, 100, |
|
7389
|
|
|
|
|
|
|
105, 111, 117, 122, 97, 101, 102, 109, |
|
7390
|
|
|
|
|
|
|
112, 120, 97, 100, 101, 105, 111, 117, |
|
7391
|
|
|
|
|
|
|
97, 101, 102, 105, 111, 117, 97, 101, |
|
7392
|
|
|
|
|
|
|
103, 105, 111, 117, 122, 98, 109, 112, |
|
7393
|
|
|
|
|
|
|
120, 97, 101, 104, 105, 111, 117, 97, |
|
7394
|
|
|
|
|
|
|
101, 105, 106, 111, 117, 97, 101, 105, |
|
7395
|
|
|
|
|
|
|
107, 111, 117, 97, 101, 105, 108, 111, |
|
7396
|
|
|
|
|
|
|
117, 122, 98, 109, 112, 120, 101, 97, |
|
7397
|
|
|
|
|
|
|
101, 105, 109, 111, 117, 97, 101, 105, |
|
7398
|
|
|
|
|
|
|
110, 111, 117, 97, 122, 97, 101, 105, |
|
7399
|
|
|
|
|
|
|
111, 112, 117, 97, 101, 105, 111, 113, |
|
7400
|
|
|
|
|
|
|
117, 97, 101, 105, 111, 114, 117, 97, |
|
7401
|
|
|
|
|
|
|
101, 105, 111, 115, 117, 122, 98, 109, |
|
7402
|
|
|
|
|
|
|
112, 120, 97, 101, 105, 111, 116, 117, |
|
7403
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 118, 122, 98, |
|
7404
|
|
|
|
|
|
|
109, 112, 120, 97, 101, 105, 111, 117, |
|
7405
|
|
|
|
|
|
|
119, 97, 101, 105, 111, 117, 120, 97, |
|
7406
|
|
|
|
|
|
|
101, 105, 111, 117, 121, 97, 101, 105, |
|
7407
|
|
|
|
|
|
|
111, 117, 122, 0 |
|
7408
|
|
|
|
|
|
|
}; |
|
7409
|
|
|
|
|
|
|
|
|
7410
|
|
|
|
|
|
|
static const char _JJR_RBR_single_lengths[] = { |
|
7411
|
|
|
|
|
|
|
0, 1, 1, 24, 0, 6, 5, 7, |
|
7412
|
|
|
|
|
|
|
6, 6, 7, 6, 6, 6, 7, 1, |
|
7413
|
|
|
|
|
|
|
6, 6, 0, 6, 6, 6, 7, 6, |
|
7414
|
|
|
|
|
|
|
7, 6, 6, 6, 6 |
|
7415
|
|
|
|
|
|
|
}; |
|
7416
|
|
|
|
|
|
|
|
|
7417
|
|
|
|
|
|
|
static const char _JJR_RBR_range_lengths[] = { |
|
7418
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 3, |
|
7419
|
|
|
|
|
|
|
0, 0, 2, 0, 0, 0, 2, 0, |
|
7420
|
|
|
|
|
|
|
0, 0, 1, 0, 0, 0, 2, 0, |
|
7421
|
|
|
|
|
|
|
2, 0, 0, 0, 0 |
|
7422
|
|
|
|
|
|
|
}; |
|
7423
|
|
|
|
|
|
|
|
|
7424
|
|
|
|
|
|
|
static const unsigned char _JJR_RBR_index_offsets[] = { |
|
7425
|
|
|
|
|
|
|
0, 0, 2, 4, 29, 30, 37, 43, |
|
7426
|
|
|
|
|
|
|
54, 61, 68, 78, 85, 92, 99, 109, |
|
7427
|
|
|
|
|
|
|
111, 118, 125, 127, 134, 141, 148, 158, |
|
7428
|
|
|
|
|
|
|
165, 175, 182, 189, 196 |
|
7429
|
|
|
|
|
|
|
}; |
|
7430
|
|
|
|
|
|
|
|
|
7431
|
|
|
|
|
|
|
static const char _JJR_RBR_indicies[] = { |
|
7432
|
|
|
|
|
|
|
0, 1, 2, 1, 4, 5, 6, 7, |
|
7433
|
|
|
|
|
|
|
8, 9, 10, 11, 12, 13, 14, 15, |
|
7434
|
|
|
|
|
|
|
16, 17, 18, 19, 20, 21, 7, 22, |
|
7435
|
|
|
|
|
|
|
23, 24, 25, 26, 3, 1, 27, 28, |
|
7436
|
|
|
|
|
|
|
27, 27, 27, 27, 1, 29, 29, 29, |
|
7437
|
|
|
|
|
|
|
29, 29, 1, 30, 31, 30, 27, 27, |
|
7438
|
|
|
|
|
|
|
27, 30, 27, 30, 30, 1, 27, 28, |
|
7439
|
|
|
|
|
|
|
27, 27, 27, 27, 1, 27, 27, 28, |
|
7440
|
|
|
|
|
|
|
27, 27, 27, 1, 27, 27, 31, 27, |
|
7441
|
|
|
|
|
|
|
27, 27, 30, 30, 30, 1, 27, 27, |
|
7442
|
|
|
|
|
|
|
28, 27, 27, 27, 1, 27, 27, 27, |
|
7443
|
|
|
|
|
|
|
28, 27, 27, 1, 27, 27, 27, 28, |
|
7444
|
|
|
|
|
|
|
27, 27, 1, 27, 27, 27, 32, 27, |
|
7445
|
|
|
|
|
|
|
27, 30, 30, 30, 1, 1, 33, 27, |
|
7446
|
|
|
|
|
|
|
27, 27, 28, 27, 27, 1, 34, 34, |
|
7447
|
|
|
|
|
|
|
34, 28, 34, 34, 1, 29, 1, 34, |
|
7448
|
|
|
|
|
|
|
34, 34, 34, 28, 34, 1, 27, 27, |
|
7449
|
|
|
|
|
|
|
27, 27, 28, 27, 1, 27, 27, 27, |
|
7450
|
|
|
|
|
|
|
27, 28, 27, 1, 27, 27, 27, 27, |
|
7451
|
|
|
|
|
|
|
31, 27, 30, 30, 30, 1, 27, 27, |
|
7452
|
|
|
|
|
|
|
27, 27, 28, 27, 1, 27, 27, 27, |
|
7453
|
|
|
|
|
|
|
27, 27, 31, 30, 30, 30, 1, 34, |
|
7454
|
|
|
|
|
|
|
34, 34, 34, 34, 28, 1, 34, 34, |
|
7455
|
|
|
|
|
|
|
34, 34, 34, 28, 1, 27, 27, 27, |
|
7456
|
|
|
|
|
|
|
27, 27, 28, 1, 27, 27, 27, 27, |
|
7457
|
|
|
|
|
|
|
27, 28, 1, 0 |
|
7458
|
|
|
|
|
|
|
}; |
|
7459
|
|
|
|
|
|
|
|
|
7460
|
|
|
|
|
|
|
static const char _JJR_RBR_trans_targs[] = { |
|
7461
|
|
|
|
|
|
|
2, 0, 3, 4, 5, 7, 8, 4, |
|
7462
|
|
|
|
|
|
|
9, 10, 11, 4, 12, 13, 14, 16, |
|
7463
|
|
|
|
|
|
|
17, 19, 20, 21, 22, 23, 24, 25, |
|
7464
|
|
|
|
|
|
|
26, 27, 28, 6, 4, 4, 4, 4, |
|
7465
|
|
|
|
|
|
|
15, 4, 18 |
|
7466
|
|
|
|
|
|
|
}; |
|
7467
|
|
|
|
|
|
|
|
|
7468
|
|
|
|
|
|
|
static const char _JJR_RBR_trans_actions[] = { |
|
7469
|
|
|
|
|
|
|
0, 0, 0, 9, 9, 9, 9, 17, |
|
7470
|
|
|
|
|
|
|
9, 9, 9, 14, 9, 9, 9, 9, |
|
7471
|
|
|
|
|
|
|
9, 9, 9, 9, 9, 9, 9, 9, |
|
7472
|
|
|
|
|
|
|
9, 9, 9, 7, 3, 5, 7, 11, |
|
7473
|
|
|
|
|
|
|
11, 1, 7 |
|
7474
|
|
|
|
|
|
|
}; |
|
7475
|
|
|
|
|
|
|
|
|
7476
|
|
|
|
|
|
|
static const int JJR_RBR_start = 1; |
|
7477
|
|
|
|
|
|
|
|
|
7478
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_JJR_RBR(const string& form, unsigned negation_len, vector& lemmas) const { |
|
7479
|
0
|
|
|
|
|
|
const char* p = form.c_str() + negation_len; int cs; |
|
7480
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
|
7481
|
|
|
|
|
|
|
|
|
7482
|
|
|
|
|
|
|
{ |
|
7483
|
|
|
|
|
|
|
cs = JJR_RBR_start; |
|
7484
|
|
|
|
|
|
|
} |
|
7485
|
|
|
|
|
|
|
|
|
7486
|
|
|
|
|
|
|
{ |
|
7487
|
|
|
|
|
|
|
int _klen; |
|
7488
|
|
|
|
|
|
|
unsigned int _trans; |
|
7489
|
|
|
|
|
|
|
const char *_acts; |
|
7490
|
|
|
|
|
|
|
unsigned int _nacts; |
|
7491
|
|
|
|
|
|
|
const char *_keys; |
|
7492
|
|
|
|
|
|
|
|
|
7493
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
7494
|
|
|
|
|
|
|
goto _test_eof; |
|
7495
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
7496
|
|
|
|
|
|
|
goto _out; |
|
7497
|
|
|
|
|
|
|
_resume: |
|
7498
|
0
|
|
|
|
|
|
_keys = _JJR_RBR_trans_keys + _JJR_RBR_key_offsets[cs]; |
|
7499
|
0
|
|
|
|
|
|
_trans = _JJR_RBR_index_offsets[cs]; |
|
7500
|
|
|
|
|
|
|
|
|
7501
|
0
|
|
|
|
|
|
_klen = _JJR_RBR_single_lengths[cs]; |
|
7502
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7503
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7504
|
|
|
|
|
|
|
const char *_mid; |
|
7505
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
|
7506
|
|
|
|
|
|
|
while (1) { |
|
7507
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7508
|
|
|
|
|
|
|
break; |
|
7509
|
|
|
|
|
|
|
|
|
7510
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
7511
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
|
7512
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
7513
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
|
7514
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
7515
|
|
|
|
|
|
|
else { |
|
7516
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
7517
|
0
|
|
|
|
|
|
goto _match; |
|
7518
|
|
|
|
|
|
|
} |
|
7519
|
|
|
|
|
|
|
} |
|
7520
|
0
|
|
|
|
|
|
_keys += _klen; |
|
7521
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7522
|
|
|
|
|
|
|
} |
|
7523
|
|
|
|
|
|
|
|
|
7524
|
0
|
|
|
|
|
|
_klen = _JJR_RBR_range_lengths[cs]; |
|
7525
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7526
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7527
|
|
|
|
|
|
|
const char *_mid; |
|
7528
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
|
7529
|
|
|
|
|
|
|
while (1) { |
|
7530
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7531
|
|
|
|
|
|
|
break; |
|
7532
|
|
|
|
|
|
|
|
|
7533
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
7534
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
|
7535
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
7536
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
|
7537
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
7538
|
|
|
|
|
|
|
else { |
|
7539
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
7540
|
0
|
|
|
|
|
|
goto _match; |
|
7541
|
|
|
|
|
|
|
} |
|
7542
|
|
|
|
|
|
|
} |
|
7543
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7544
|
|
|
|
|
|
|
} |
|
7545
|
|
|
|
|
|
|
|
|
7546
|
|
|
|
|
|
|
_match: |
|
7547
|
0
|
|
|
|
|
|
_trans = _JJR_RBR_indicies[_trans]; |
|
7548
|
0
|
|
|
|
|
|
cs = _JJR_RBR_trans_targs[_trans]; |
|
7549
|
|
|
|
|
|
|
|
|
7550
|
0
|
0
|
|
|
|
|
if ( _JJR_RBR_trans_actions[_trans] == 0 ) |
|
7551
|
|
|
|
|
|
|
goto _again; |
|
7552
|
|
|
|
|
|
|
|
|
7553
|
0
|
|
|
|
|
|
_acts = _JJR_RBR_actions + _JJR_RBR_trans_actions[_trans]; |
|
7554
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
|
7555
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
|
7556
|
|
|
|
|
|
|
{ |
|
7557
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
|
7558
|
|
|
|
|
|
|
{ |
|
7559
|
|
|
|
|
|
|
case 0: |
|
7560
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 2, append = nullptr; } |
|
7561
|
|
|
|
|
|
|
break; |
|
7562
|
|
|
|
|
|
|
case 1: |
|
7563
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 3, append = nullptr; } |
|
7564
|
|
|
|
|
|
|
break; |
|
7565
|
|
|
|
|
|
|
case 2: |
|
7566
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 3, append = "y"; } |
|
7567
|
|
|
|
|
|
|
break; |
|
7568
|
|
|
|
|
|
|
case 3: |
|
7569
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7570
|
|
|
|
|
|
|
break; |
|
7571
|
|
|
|
|
|
|
case 4: |
|
7572
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
7573
|
|
|
|
|
|
|
break; |
|
7574
|
|
|
|
|
|
|
case 5: |
|
7575
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
7576
|
|
|
|
|
|
|
break; |
|
7577
|
|
|
|
|
|
|
} |
|
7578
|
|
|
|
|
|
|
} |
|
7579
|
|
|
|
|
|
|
|
|
7580
|
|
|
|
|
|
|
_again: |
|
7581
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
7582
|
|
|
|
|
|
|
goto _out; |
|
7583
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7584
|
|
|
|
|
|
|
goto _resume; |
|
7585
|
|
|
|
|
|
|
_test_eof: {} |
|
7586
|
|
|
|
|
|
|
_out: {} |
|
7587
|
|
|
|
|
|
|
} |
|
7588
|
|
|
|
|
|
|
|
|
7589
|
0
|
0
|
|
|
|
|
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7590
|
0
|
|
|
|
|
|
} |
|
7591
|
|
|
|
|
|
|
|
|
7592
|
|
|
|
|
|
|
static const char _JJS_RBS_actions[] = { |
|
7593
|
|
|
|
|
|
|
0, 1, 1, 1, 2, 1, 4, 1, |
|
7594
|
|
|
|
|
|
|
5, 2, 0, 5, 2, 1, 4, 2, |
|
7595
|
|
|
|
|
|
|
3, 5 |
|
7596
|
|
|
|
|
|
|
}; |
|
7597
|
|
|
|
|
|
|
|
|
7598
|
|
|
|
|
|
|
static const unsigned char _JJS_RBS_key_offsets[] = { |
|
7599
|
|
|
|
|
|
|
0, 0, 1, 2, 3, 25, 25, 25, |
|
7600
|
|
|
|
|
|
|
31, 44, 50, 56, 67, 73, 79, 85, |
|
7601
|
|
|
|
|
|
|
96, 102, 108, 114, 120, 126, 137, 143, |
|
7602
|
|
|
|
|
|
|
154, 160, 166, 172, 178, 178, 183, 183, |
|
7603
|
|
|
|
|
|
|
183, 184 |
|
7604
|
|
|
|
|
|
|
}; |
|
7605
|
|
|
|
|
|
|
|
|
7606
|
|
|
|
|
|
|
static const char _JJS_RBS_trans_keys[] = { |
|
7607
|
|
|
|
|
|
|
116, 115, 101, 98, 99, 100, 102, 103, |
|
7608
|
|
|
|
|
|
|
104, 105, 106, 107, 108, 109, 110, 112, |
|
7609
|
|
|
|
|
|
|
113, 114, 115, 116, 118, 119, 120, 121, |
|
7610
|
|
|
|
|
|
|
122, 97, 98, 101, 105, 111, 117, 98, |
|
7611
|
|
|
|
|
|
|
99, 100, 105, 111, 117, 122, 97, 101, |
|
7612
|
|
|
|
|
|
|
102, 109, 112, 120, 97, 100, 101, 105, |
|
7613
|
|
|
|
|
|
|
111, 117, 97, 101, 102, 105, 111, 117, |
|
7614
|
|
|
|
|
|
|
97, 101, 103, 105, 111, 117, 122, 98, |
|
7615
|
|
|
|
|
|
|
109, 112, 120, 97, 101, 104, 105, 111, |
|
7616
|
|
|
|
|
|
|
117, 97, 101, 105, 106, 111, 117, 97, |
|
7617
|
|
|
|
|
|
|
101, 105, 107, 111, 117, 97, 101, 105, |
|
7618
|
|
|
|
|
|
|
108, 111, 117, 122, 98, 109, 112, 120, |
|
7619
|
|
|
|
|
|
|
97, 101, 105, 109, 111, 117, 97, 101, |
|
7620
|
|
|
|
|
|
|
105, 110, 111, 117, 97, 101, 105, 111, |
|
7621
|
|
|
|
|
|
|
112, 117, 97, 101, 105, 111, 113, 117, |
|
7622
|
|
|
|
|
|
|
97, 101, 105, 111, 114, 117, 97, 101, |
|
7623
|
|
|
|
|
|
|
105, 111, 115, 117, 122, 98, 109, 112, |
|
7624
|
|
|
|
|
|
|
120, 97, 101, 105, 111, 116, 117, 97, |
|
7625
|
|
|
|
|
|
|
101, 105, 111, 117, 118, 122, 98, 109, |
|
7626
|
|
|
|
|
|
|
112, 120, 97, 101, 105, 111, 117, 119, |
|
7627
|
|
|
|
|
|
|
97, 101, 105, 111, 117, 120, 97, 101, |
|
7628
|
|
|
|
|
|
|
105, 111, 117, 121, 97, 101, 105, 111, |
|
7629
|
|
|
|
|
|
|
117, 122, 97, 101, 105, 111, 117, 101, |
|
7630
|
|
|
|
|
|
|
97, 122, 0 |
|
7631
|
|
|
|
|
|
|
}; |
|
7632
|
|
|
|
|
|
|
|
|
7633
|
|
|
|
|
|
|
static const char _JJS_RBS_single_lengths[] = { |
|
7634
|
|
|
|
|
|
|
0, 1, 1, 1, 22, 0, 0, 6, |
|
7635
|
|
|
|
|
|
|
7, 6, 6, 7, 6, 6, 6, 7, |
|
7636
|
|
|
|
|
|
|
6, 6, 6, 6, 6, 7, 6, 7, |
|
7637
|
|
|
|
|
|
|
6, 6, 6, 6, 0, 5, 0, 0, |
|
7638
|
|
|
|
|
|
|
1, 0 |
|
7639
|
|
|
|
|
|
|
}; |
|
7640
|
|
|
|
|
|
|
|
|
7641
|
|
|
|
|
|
|
static const char _JJS_RBS_range_lengths[] = { |
|
7642
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7643
|
|
|
|
|
|
|
3, 0, 0, 2, 0, 0, 0, 2, |
|
7644
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 2, 0, 2, |
|
7645
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7646
|
|
|
|
|
|
|
0, 1 |
|
7647
|
|
|
|
|
|
|
}; |
|
7648
|
|
|
|
|
|
|
|
|
7649
|
|
|
|
|
|
|
static const unsigned char _JJS_RBS_index_offsets[] = { |
|
7650
|
|
|
|
|
|
|
0, 0, 2, 4, 6, 29, 30, 31, |
|
7651
|
|
|
|
|
|
|
38, 49, 56, 63, 73, 80, 87, 94, |
|
7652
|
|
|
|
|
|
|
104, 111, 118, 125, 132, 139, 149, 156, |
|
7653
|
|
|
|
|
|
|
166, 173, 180, 187, 194, 195, 201, 202, |
|
7654
|
|
|
|
|
|
|
203, 205 |
|
7655
|
|
|
|
|
|
|
}; |
|
7656
|
|
|
|
|
|
|
|
|
7657
|
|
|
|
|
|
|
static const char _JJS_RBS_indicies[] = { |
|
7658
|
|
|
|
|
|
|
0, 1, 2, 1, 3, 1, 5, 6, |
|
7659
|
|
|
|
|
|
|
7, 8, 9, 10, 11, 12, 13, 14, |
|
7660
|
|
|
|
|
|
|
15, 16, 17, 18, 19, 20, 21, 22, |
|
7661
|
|
|
|
|
|
|
23, 24, 25, 26, 4, 27, 28, 29, |
|
7662
|
|
|
|
|
|
|
30, 29, 29, 29, 29, 27, 31, 32, |
|
7663
|
|
|
|
|
|
|
31, 29, 29, 29, 31, 29, 31, 31, |
|
7664
|
|
|
|
|
|
|
27, 29, 30, 29, 29, 29, 29, 27, |
|
7665
|
|
|
|
|
|
|
29, 29, 30, 29, 29, 29, 27, 29, |
|
7666
|
|
|
|
|
|
|
29, 32, 29, 29, 29, 31, 31, 31, |
|
7667
|
|
|
|
|
|
|
27, 29, 29, 30, 29, 29, 29, 27, |
|
7668
|
|
|
|
|
|
|
29, 29, 29, 30, 29, 29, 27, 29, |
|
7669
|
|
|
|
|
|
|
29, 29, 30, 29, 29, 27, 29, 29, |
|
7670
|
|
|
|
|
|
|
29, 33, 29, 29, 31, 31, 31, 27, |
|
7671
|
|
|
|
|
|
|
29, 29, 29, 30, 29, 29, 27, 34, |
|
7672
|
|
|
|
|
|
|
34, 34, 30, 34, 34, 27, 34, 34, |
|
7673
|
|
|
|
|
|
|
34, 34, 30, 34, 27, 29, 29, 29, |
|
7674
|
|
|
|
|
|
|
29, 30, 29, 27, 29, 29, 29, 29, |
|
7675
|
|
|
|
|
|
|
30, 29, 27, 29, 29, 29, 29, 32, |
|
7676
|
|
|
|
|
|
|
29, 31, 31, 31, 27, 29, 29, 29, |
|
7677
|
|
|
|
|
|
|
29, 30, 29, 27, 29, 29, 29, 29, |
|
7678
|
|
|
|
|
|
|
29, 32, 31, 31, 31, 27, 34, 34, |
|
7679
|
|
|
|
|
|
|
34, 34, 34, 30, 27, 34, 34, 34, |
|
7680
|
|
|
|
|
|
|
34, 34, 30, 27, 29, 29, 29, 29, |
|
7681
|
|
|
|
|
|
|
29, 30, 27, 29, 29, 29, 29, 29, |
|
7682
|
|
|
|
|
|
|
30, 27, 1, 35, 35, 35, 35, 35, |
|
7683
|
|
|
|
|
|
|
28, 28, 27, 28, 36, 35, 28, 0 |
|
7684
|
|
|
|
|
|
|
}; |
|
7685
|
|
|
|
|
|
|
|
|
7686
|
|
|
|
|
|
|
static const char _JJS_RBS_trans_targs[] = { |
|
7687
|
|
|
|
|
|
|
2, 0, 3, 4, 5, 7, 8, 9, |
|
7688
|
|
|
|
|
|
|
10, 11, 12, 31, 13, 14, 15, 16, |
|
7689
|
|
|
|
|
|
|
17, 18, 19, 20, 21, 22, 23, 24, |
|
7690
|
|
|
|
|
|
|
25, 26, 27, 6, 28, 29, 30, 30, |
|
7691
|
|
|
|
|
|
|
30, 32, 33, 28, 28 |
|
7692
|
|
|
|
|
|
|
}; |
|
7693
|
|
|
|
|
|
|
|
|
7694
|
|
|
|
|
|
|
static const char _JJS_RBS_trans_actions[] = { |
|
7695
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7696
|
|
|
|
|
|
|
0, 0, 0, 3, 0, 0, 0, 0, |
|
7697
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
7698
|
|
|
|
|
|
|
0, 0, 0, 0, 7, 5, 1, 5, |
|
7699
|
|
|
|
|
|
|
12, 12, 5, 15, 9 |
|
7700
|
|
|
|
|
|
|
}; |
|
7701
|
|
|
|
|
|
|
|
|
7702
|
|
|
|
|
|
|
static const int JJS_RBS_start = 1; |
|
7703
|
|
|
|
|
|
|
|
|
7704
|
0
|
|
|
|
|
|
void english_morpho_guesser::add_JJS_RBS(const string& form, unsigned negation_len, vector& lemmas) const { |
|
7705
|
0
|
|
|
|
|
|
const char* p = form.c_str() + negation_len; int cs; |
|
7706
|
|
|
|
|
|
|
char best = 'z'; unsigned remove = 0; const char* append = nullptr; |
|
7707
|
|
|
|
|
|
|
|
|
7708
|
|
|
|
|
|
|
{ |
|
7709
|
|
|
|
|
|
|
cs = JJS_RBS_start; |
|
7710
|
|
|
|
|
|
|
} |
|
7711
|
|
|
|
|
|
|
|
|
7712
|
|
|
|
|
|
|
{ |
|
7713
|
|
|
|
|
|
|
int _klen; |
|
7714
|
|
|
|
|
|
|
unsigned int _trans; |
|
7715
|
|
|
|
|
|
|
const char *_acts; |
|
7716
|
|
|
|
|
|
|
unsigned int _nacts; |
|
7717
|
|
|
|
|
|
|
const char *_keys; |
|
7718
|
|
|
|
|
|
|
|
|
7719
|
0
|
0
|
|
|
|
|
if ( p == ( (form.c_str() + form.size())) ) |
|
7720
|
|
|
|
|
|
|
goto _test_eof; |
|
7721
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
7722
|
|
|
|
|
|
|
goto _out; |
|
7723
|
|
|
|
|
|
|
_resume: |
|
7724
|
0
|
|
|
|
|
|
_keys = _JJS_RBS_trans_keys + _JJS_RBS_key_offsets[cs]; |
|
7725
|
0
|
|
|
|
|
|
_trans = _JJS_RBS_index_offsets[cs]; |
|
7726
|
|
|
|
|
|
|
|
|
7727
|
0
|
|
|
|
|
|
_klen = _JJS_RBS_single_lengths[cs]; |
|
7728
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7729
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7730
|
|
|
|
|
|
|
const char *_mid; |
|
7731
|
0
|
|
|
|
|
|
const char *_upper = _keys + _klen - 1; |
|
7732
|
|
|
|
|
|
|
while (1) { |
|
7733
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7734
|
|
|
|
|
|
|
break; |
|
7735
|
|
|
|
|
|
|
|
|
7736
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
7737
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
|
7738
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
7739
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
|
7740
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
7741
|
|
|
|
|
|
|
else { |
|
7742
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
7743
|
0
|
|
|
|
|
|
goto _match; |
|
7744
|
|
|
|
|
|
|
} |
|
7745
|
|
|
|
|
|
|
} |
|
7746
|
0
|
|
|
|
|
|
_keys += _klen; |
|
7747
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7748
|
|
|
|
|
|
|
} |
|
7749
|
|
|
|
|
|
|
|
|
7750
|
0
|
|
|
|
|
|
_klen = _JJS_RBS_range_lengths[cs]; |
|
7751
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
7752
|
|
|
|
|
|
|
const char *_lower = _keys; |
|
7753
|
|
|
|
|
|
|
const char *_mid; |
|
7754
|
0
|
|
|
|
|
|
const char *_upper = _keys + (_klen<<1) - 2; |
|
7755
|
|
|
|
|
|
|
while (1) { |
|
7756
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
7757
|
|
|
|
|
|
|
break; |
|
7758
|
|
|
|
|
|
|
|
|
7759
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
7760
|
0
|
0
|
|
|
|
|
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
|
7761
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
7762
|
0
|
0
|
|
|
|
|
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
|
7763
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
7764
|
|
|
|
|
|
|
else { |
|
7765
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
7766
|
0
|
|
|
|
|
|
goto _match; |
|
7767
|
|
|
|
|
|
|
} |
|
7768
|
|
|
|
|
|
|
} |
|
7769
|
0
|
|
|
|
|
|
_trans += _klen; |
|
7770
|
|
|
|
|
|
|
} |
|
7771
|
|
|
|
|
|
|
|
|
7772
|
|
|
|
|
|
|
_match: |
|
7773
|
0
|
|
|
|
|
|
_trans = _JJS_RBS_indicies[_trans]; |
|
7774
|
0
|
|
|
|
|
|
cs = _JJS_RBS_trans_targs[_trans]; |
|
7775
|
|
|
|
|
|
|
|
|
7776
|
0
|
0
|
|
|
|
|
if ( _JJS_RBS_trans_actions[_trans] == 0 ) |
|
7777
|
|
|
|
|
|
|
goto _again; |
|
7778
|
|
|
|
|
|
|
|
|
7779
|
0
|
|
|
|
|
|
_acts = _JJS_RBS_actions + _JJS_RBS_trans_actions[_trans]; |
|
7780
|
0
|
|
|
|
|
|
_nacts = (unsigned int) *_acts++; |
|
7781
|
0
|
0
|
|
|
|
|
while ( _nacts-- > 0 ) |
|
7782
|
|
|
|
|
|
|
{ |
|
7783
|
0
|
|
|
|
|
|
switch ( *_acts++ ) |
|
7784
|
|
|
|
|
|
|
{ |
|
7785
|
|
|
|
|
|
|
case 0: |
|
7786
|
0
|
0
|
|
|
|
|
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
|
7787
|
|
|
|
|
|
|
break; |
|
7788
|
|
|
|
|
|
|
case 1: |
|
7789
|
0
|
0
|
|
|
|
|
{ if (best > 'b') best = 'b', remove = 4, append = nullptr; } |
|
7790
|
|
|
|
|
|
|
break; |
|
7791
|
|
|
|
|
|
|
case 2: |
|
7792
|
0
|
0
|
|
|
|
|
{ if (best > 'c') best = 'c', remove = 4, append = "y"; } |
|
7793
|
|
|
|
|
|
|
break; |
|
7794
|
|
|
|
|
|
|
case 3: |
|
7795
|
0
|
0
|
|
|
|
|
{ if (best > 'd') best = 'd', remove = 3, append = nullptr; } |
|
7796
|
|
|
|
|
|
|
break; |
|
7797
|
|
|
|
|
|
|
case 4: |
|
7798
|
0
|
0
|
|
|
|
|
{ if (best > 'e') best = 'e', remove = 2, append = nullptr; } |
|
7799
|
|
|
|
|
|
|
break; |
|
7800
|
|
|
|
|
|
|
case 5: |
|
7801
|
0
|
0
|
|
|
|
|
{ if (best > 'f') best = 'f', remove = 3, append = nullptr; } |
|
7802
|
|
|
|
|
|
|
break; |
|
7803
|
|
|
|
|
|
|
} |
|
7804
|
|
|
|
|
|
|
} |
|
7805
|
|
|
|
|
|
|
|
|
7806
|
|
|
|
|
|
|
_again: |
|
7807
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
7808
|
|
|
|
|
|
|
goto _out; |
|
7809
|
0
|
0
|
|
|
|
|
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7810
|
|
|
|
|
|
|
goto _resume; |
|
7811
|
|
|
|
|
|
|
_test_eof: {} |
|
7812
|
|
|
|
|
|
|
_out: {} |
|
7813
|
|
|
|
|
|
|
} |
|
7814
|
|
|
|
|
|
|
|
|
7815
|
0
|
0
|
|
|
|
|
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7816
|
0
|
|
|
|
|
|
} |
|
7817
|
|
|
|
|
|
|
|
|
7818
|
|
|
|
|
|
|
} // namespace morphodita |
|
7819
|
|
|
|
|
|
|
|
|
7820
|
|
|
|
|
|
|
///////// |
|
7821
|
|
|
|
|
|
|
// File: morphodita/morpho/external_morpho.h |
|
7822
|
|
|
|
|
|
|
///////// |
|
7823
|
|
|
|
|
|
|
|
|
7824
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
7825
|
|
|
|
|
|
|
// |
|
7826
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
7827
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
7828
|
|
|
|
|
|
|
// |
|
7829
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
7830
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
7831
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
7832
|
|
|
|
|
|
|
|
|
7833
|
|
|
|
|
|
|
namespace morphodita { |
|
7834
|
|
|
|
|
|
|
|
|
7835
|
0
|
|
|
|
|
|
class external_morpho : public morpho { |
|
7836
|
|
|
|
|
|
|
public: |
|
7837
|
0
|
|
|
|
|
|
external_morpho(unsigned version) : version(version) {} |
|
7838
|
|
|
|
|
|
|
|
|
7839
|
|
|
|
|
|
|
virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override; |
|
7840
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override; |
|
7841
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const override; |
|
7842
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const override; |
|
7843
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const override; |
|
7844
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const override; |
|
7845
|
|
|
|
|
|
|
|
|
7846
|
|
|
|
|
|
|
bool load(istream& is); |
|
7847
|
|
|
|
|
|
|
|
|
7848
|
|
|
|
|
|
|
private: |
|
7849
|
|
|
|
|
|
|
unsigned version; |
|
7850
|
|
|
|
|
|
|
|
|
7851
|
|
|
|
|
|
|
string unknown_tag; |
|
7852
|
|
|
|
|
|
|
}; |
|
7853
|
|
|
|
|
|
|
|
|
7854
|
|
|
|
|
|
|
} // namespace morphodita |
|
7855
|
|
|
|
|
|
|
|
|
7856
|
|
|
|
|
|
|
///////// |
|
7857
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer.h |
|
7858
|
|
|
|
|
|
|
///////// |
|
7859
|
|
|
|
|
|
|
|
|
7860
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
7861
|
|
|
|
|
|
|
// |
|
7862
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
7863
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
7864
|
|
|
|
|
|
|
// |
|
7865
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
7866
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
7867
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
7868
|
|
|
|
|
|
|
|
|
7869
|
|
|
|
|
|
|
namespace morphodita { |
|
7870
|
|
|
|
|
|
|
|
|
7871
|
0
|
|
|
|
|
|
class generic_tokenizer : public ragel_tokenizer { |
|
7872
|
|
|
|
|
|
|
public: |
|
7873
|
|
|
|
|
|
|
enum { LATEST = 2 }; |
|
7874
|
|
|
|
|
|
|
generic_tokenizer(unsigned version); |
|
7875
|
|
|
|
|
|
|
|
|
7876
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
|
7877
|
|
|
|
|
|
|
}; |
|
7878
|
|
|
|
|
|
|
|
|
7879
|
|
|
|
|
|
|
} // namespace morphodita |
|
7880
|
|
|
|
|
|
|
|
|
7881
|
|
|
|
|
|
|
///////// |
|
7882
|
|
|
|
|
|
|
// File: morphodita/morpho/external_morpho.cpp |
|
7883
|
|
|
|
|
|
|
///////// |
|
7884
|
|
|
|
|
|
|
|
|
7885
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
7886
|
|
|
|
|
|
|
// |
|
7887
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
7888
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
7889
|
|
|
|
|
|
|
// |
|
7890
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
7891
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
7892
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
7893
|
|
|
|
|
|
|
|
|
7894
|
|
|
|
|
|
|
namespace morphodita { |
|
7895
|
|
|
|
|
|
|
|
|
7896
|
0
|
|
|
|
|
|
// Loads the model from a compressed stream.
//
// The payload is a single length-prefixed string (one length byte followed by
// that many characters) holding unknown_tag. Returns false when decompression
// fails, when the decoder reports an error, or when bytes remain after the tag.
bool external_morpho::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    // Load unknown_tag
    unsigned length = data.next_1B();
    unknown_tag.assign(data.next<char>(length), length);
  } catch (binary_decoder_error&) {
    return false;
  }

  // Trailing data means the stream is not a valid external_morpho model.
  return data.is_end();
}
|
7910
|
|
|
|
|
|
|
|
|
7911
|
0
|
|
|
|
|
|
int external_morpho::analyze(string_piece form, guesser_mode /*guesser*/, vector& lemmas) const { |
|
7912
|
|
|
|
|
|
|
lemmas.clear(); |
|
7913
|
|
|
|
|
|
|
|
|
7914
|
0
|
0
|
|
|
|
|
if (form.len) { |
|
7915
|
|
|
|
|
|
|
// Start by skipping the first form |
|
7916
|
|
|
|
|
|
|
string_piece lemmatags = form; |
|
7917
|
0
|
0
|
|
|
|
|
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
|
0
|
|
|
|
|
|
|
7918
|
0
|
0
|
|
|
|
|
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
|
7919
|
|
|
|
|
|
|
|
|
7920
|
|
|
|
|
|
|
// Split lemmatags using ' ' into lemma-tag pairs. |
|
7921
|
0
|
0
|
|
|
|
|
while (lemmatags.len) { |
|
7922
|
|
|
|
|
|
|
auto lemma_start = lemmatags.str; |
|
7923
|
0
|
0
|
|
|
|
|
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
|
0
|
|
|
|
|
|
|
7924
|
0
|
0
|
|
|
|
|
if (!lemmatags.len) break; |
|
7925
|
|
|
|
|
|
|
auto lemma_len = lemmatags.str - lemma_start; |
|
7926
|
0
|
|
|
|
|
|
lemmatags.len--, lemmatags.str++; |
|
7927
|
|
|
|
|
|
|
|
|
7928
|
|
|
|
|
|
|
auto tag_start = lemmatags.str; |
|
7929
|
0
|
0
|
|
|
|
|
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
|
0
|
|
|
|
|
|
|
7930
|
|
|
|
|
|
|
auto tag_len = lemmatags.str - tag_start; |
|
7931
|
0
|
0
|
|
|
|
|
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
|
7932
|
|
|
|
|
|
|
|
|
7933
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len)); |
|
7934
|
|
|
|
|
|
|
} |
|
7935
|
|
|
|
|
|
|
|
|
7936
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
|
7937
|
|
|
|
|
|
|
} |
|
7938
|
|
|
|
|
|
|
|
|
7939
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
|
7940
|
0
|
|
|
|
|
|
return -1; |
|
7941
|
|
|
|
|
|
|
} |
|
7942
|
|
|
|
|
|
|
|
|
7943
|
0
|
|
|
|
|
|
int external_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector& forms) const { |
|
7944
|
|
|
|
|
|
|
forms.clear(); |
|
7945
|
|
|
|
|
|
|
|
|
7946
|
0
|
|
|
|
|
|
tag_filter filter(tag_wildcard); |
|
7947
|
|
|
|
|
|
|
|
|
7948
|
0
|
0
|
|
|
|
|
if (lemma.len) { |
|
7949
|
|
|
|
|
|
|
// Start by locating the lemma |
|
7950
|
|
|
|
|
|
|
string_piece formtags = lemma; |
|
7951
|
0
|
0
|
|
|
|
|
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
|
0
|
|
|
|
|
|
|
7952
|
0
|
|
|
|
|
|
string_piece real_lemma(lemma.str, lemma.len - formtags.len); |
|
7953
|
0
|
0
|
|
|
|
|
if (formtags.len) formtags.len--, formtags.str++; |
|
7954
|
|
|
|
|
|
|
|
|
7955
|
|
|
|
|
|
|
// Split formtags using ' ' into form-tag pairs. |
|
7956
|
|
|
|
|
|
|
bool any_result = false; |
|
7957
|
0
|
0
|
|
|
|
|
while (formtags.len) { |
|
7958
|
|
|
|
|
|
|
auto form_start = formtags.str; |
|
7959
|
0
|
0
|
|
|
|
|
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
|
0
|
|
|
|
|
|
|
7960
|
0
|
0
|
|
|
|
|
if (!formtags.len) break; |
|
7961
|
|
|
|
|
|
|
auto form_len = formtags.str - form_start; |
|
7962
|
0
|
|
|
|
|
|
formtags.len--, formtags.str++; |
|
7963
|
|
|
|
|
|
|
|
|
7964
|
|
|
|
|
|
|
auto tag_start = formtags.str; |
|
7965
|
0
|
0
|
|
|
|
|
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
|
0
|
|
|
|
|
|
|
7966
|
|
|
|
|
|
|
auto tag_len = formtags.str - tag_start; |
|
7967
|
0
|
0
|
|
|
|
|
if (formtags.len) formtags.len--, formtags.str++; |
|
7968
|
|
|
|
|
|
|
|
|
7969
|
|
|
|
|
|
|
any_result = true; |
|
7970
|
|
|
|
|
|
|
string tag(tag_start, tag_len); |
|
7971
|
0
|
0
|
|
|
|
|
if (filter.matches(tag.c_str())) { |
|
7972
|
0
|
0
|
|
|
|
|
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
|
|
|
0
|
|
|
|
|
|
|
7973
|
0
|
0
|
|
|
|
|
forms.back().forms.emplace_back(string(form_start, form_len), tag); |
|
7974
|
|
|
|
|
|
|
} |
|
7975
|
|
|
|
|
|
|
} |
|
7976
|
|
|
|
|
|
|
|
|
7977
|
0
|
0
|
|
|
|
|
if (any_result) return NO_GUESSER; |
|
7978
|
|
|
|
|
|
|
} |
|
7979
|
|
|
|
|
|
|
|
|
7980
|
|
|
|
|
|
|
return -1; |
|
7981
|
|
|
|
|
|
|
} |
|
7982
|
|
|
|
|
|
|
|
|
7983
|
0
|
|
|
|
|
|
// Length of the raw lemma: the number of characters before the first space,
// or the whole input length when it contains no space.
int external_morpho::raw_lemma_len(string_piece lemma) const {
  unsigned prefix = 0;
  for (; prefix < lemma.len; prefix++)
    if (lemma.str[prefix] == ' ') break;
  return prefix;
}
|
7988
|
|
|
|
|
|
|
|
|
7989
|
0
|
|
|
|
|
|
// Length of the lemma id: the number of characters before the first space,
// or the whole input length when it contains no space.
int external_morpho::lemma_id_len(string_piece lemma) const {
  unsigned prefix = 0;
  for (; prefix < lemma.len; prefix++)
    if (lemma.str[prefix] == ' ') break;
  return prefix;
}
|
7994
|
|
|
|
|
|
|
|
|
7995
|
0
|
|
|
|
|
|
// Length of the raw form: the number of characters before the first space,
// or the whole input length when it contains no space.
int external_morpho::raw_form_len(string_piece form) const {
  unsigned prefix = 0;
  for (; prefix < form.len; prefix++)
    if (form.str[prefix] == ' ') break;
  return prefix;
}
|
8000
|
|
|
|
|
|
|
|
|
8001
|
0
|
|
|
|
|
|
// Creates a generic_tokenizer for this model's version.
// The returned object is allocated with new; ownership passes to the caller.
tokenizer* external_morpho::new_tokenizer() const {
  tokenizer* created = new generic_tokenizer(version);
  return created;
}
|
8004
|
|
|
|
|
|
|
|
|
8005
|
|
|
|
|
|
|
} // namespace morphodita |
|
8006
|
|
|
|
|
|
|
|
|
8007
|
|
|
|
|
|
|
///////// |
|
8008
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_lemma_addinfo.h |
|
8009
|
|
|
|
|
|
|
///////// |
|
8010
|
|
|
|
|
|
|
|
|
8011
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8012
|
|
|
|
|
|
|
// |
|
8013
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8014
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8015
|
|
|
|
|
|
|
// |
|
8016
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8017
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8018
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8019
|
|
|
|
|
|
|
|
|
8020
|
|
|
|
|
|
|
namespace morphodita { |
|
8021
|
|
|
|
|
|
|
|
|
8022
|
|
|
|
|
|
|
// Declarations |
|
8023
|
0
|
|
|
|
|
|
struct generic_lemma_addinfo { |
|
8024
|
|
|
|
|
|
|
inline static int raw_lemma_len(string_piece lemma); |
|
8025
|
|
|
|
|
|
|
inline static int lemma_id_len(string_piece lemma); |
|
8026
|
|
|
|
|
|
|
inline static string format(const unsigned char* addinfo, int addinfo_len); |
|
8027
|
|
|
|
|
|
|
inline static bool generatable(const unsigned char* addinfo, int addinfo_len); |
|
8028
|
|
|
|
|
|
|
|
|
8029
|
|
|
|
|
|
|
inline int parse(string_piece lemma, bool die_on_failure = false); |
|
8030
|
|
|
|
|
|
|
inline bool match_lemma_id(const unsigned char* other_addinfo, int other_addinfo_len); |
|
8031
|
|
|
|
|
|
|
|
|
8032
|
|
|
|
|
|
|
vector data; |
|
8033
|
|
|
|
|
|
|
}; |
|
8034
|
|
|
|
|
|
|
|
|
8035
|
|
|
|
|
|
|
// Definitions |
|
8036
|
|
|
|
|
|
|
int generic_lemma_addinfo::raw_lemma_len(string_piece lemma) { |
|
8037
|
0
|
|
|
|
|
|
return lemma.len; |
|
8038
|
|
|
|
|
|
|
} |
|
8039
|
|
|
|
|
|
|
|
|
8040
|
|
|
|
|
|
|
int generic_lemma_addinfo::lemma_id_len(string_piece lemma) { |
|
8041
|
0
|
|
|
|
|
|
return lemma.len; |
|
8042
|
|
|
|
|
|
|
} |
|
8043
|
|
|
|
|
|
|
|
|
8044
|
|
|
|
|
|
|
string generic_lemma_addinfo::format(const unsigned char* /*addinfo*/, int /*addinfo_len*/) { |
|
8045
|
|
|
|
|
|
|
return string(); |
|
8046
|
|
|
|
|
|
|
} |
|
8047
|
|
|
|
|
|
|
|
|
8048
|
|
|
|
|
|
|
bool generic_lemma_addinfo::generatable(const unsigned char* /*addinfo*/, int /*addinfo_len*/) { |
|
8049
|
|
|
|
|
|
|
return true; |
|
8050
|
|
|
|
|
|
|
} |
|
8051
|
|
|
|
|
|
|
|
|
8052
|
|
|
|
|
|
|
int generic_lemma_addinfo::parse(string_piece lemma, bool /*die_on_failure*/) { |
|
8053
|
0
|
|
|
|
|
|
return lemma.len; |
|
8054
|
|
|
|
|
|
|
} |
|
8055
|
|
|
|
|
|
|
|
|
8056
|
|
|
|
|
|
|
bool generic_lemma_addinfo::match_lemma_id(const unsigned char* /*other_addinfo*/, int /*other_addinfo_len*/) { |
|
8057
|
|
|
|
|
|
|
return true; |
|
8058
|
|
|
|
|
|
|
} |
|
8059
|
|
|
|
|
|
|
|
|
8060
|
|
|
|
|
|
|
} // namespace morphodita |
|
8061
|
|
|
|
|
|
|
|
|
8062
|
|
|
|
|
|
|
///////// |
|
8063
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho.h |
|
8064
|
|
|
|
|
|
|
///////// |
|
8065
|
|
|
|
|
|
|
|
|
8066
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8067
|
|
|
|
|
|
|
// |
|
8068
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8069
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8070
|
|
|
|
|
|
|
// |
|
8071
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8072
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8073
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8074
|
|
|
|
|
|
|
|
|
8075
|
|
|
|
|
|
|
namespace morphodita { |
|
8076
|
|
|
|
|
|
|
|
|
8077
|
4
|
|
|
|
|
|
class generic_morpho : public morpho { |
|
8078
|
|
|
|
|
|
|
public: |
|
8079
|
1
|
|
|
|
|
|
generic_morpho(unsigned version) : version(version) {} |
|
8080
|
|
|
|
|
|
|
|
|
8081
|
|
|
|
|
|
|
virtual int analyze(string_piece form, morpho::guesser_mode guesser, vector& lemmas) const override; |
|
8082
|
|
|
|
|
|
|
virtual int generate(string_piece lemma, const char* tag_wildcard, guesser_mode guesser, vector& forms) const override; |
|
8083
|
|
|
|
|
|
|
virtual int raw_lemma_len(string_piece lemma) const override; |
|
8084
|
|
|
|
|
|
|
virtual int lemma_id_len(string_piece lemma) const override; |
|
8085
|
|
|
|
|
|
|
virtual int raw_form_len(string_piece form) const override; |
|
8086
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer() const override; |
|
8087
|
|
|
|
|
|
|
|
|
8088
|
|
|
|
|
|
|
bool load(istream& is); |
|
8089
|
|
|
|
|
|
|
private: |
|
8090
|
|
|
|
|
|
|
inline void analyze_special(string_piece form, vector& lemmas) const; |
|
8091
|
|
|
|
|
|
|
|
|
8092
|
|
|
|
|
|
|
unsigned version; |
|
8093
|
|
|
|
|
|
|
morpho_dictionary dictionary; |
|
8094
|
|
|
|
|
|
|
unique_ptr statistical_guesser; |
|
8095
|
|
|
|
|
|
|
|
|
8096
|
|
|
|
|
|
|
string unknown_tag, number_tag, punctuation_tag, symbol_tag; |
|
8097
|
|
|
|
|
|
|
}; |
|
8098
|
|
|
|
|
|
|
|
|
8099
|
|
|
|
|
|
|
} // namespace morphodita |
|
8100
|
|
|
|
|
|
|
|
|
8101
|
|
|
|
|
|
|
///////// |
|
8102
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho.cpp |
|
8103
|
|
|
|
|
|
|
///////// |
|
8104
|
|
|
|
|
|
|
|
|
8105
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8106
|
|
|
|
|
|
|
// |
|
8107
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8108
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8109
|
|
|
|
|
|
|
// |
|
8110
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8111
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8112
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8113
|
|
|
|
|
|
|
|
|
8114
|
|
|
|
|
|
|
namespace morphodita { |
|
8115
|
|
|
|
|
|
|
|
|
8116
|
1
|
|
|
|
|
|
// Loads the model from a compressed stream.
//
// Layout, in order: four length-prefixed tags (unknown, number, punctuation,
// symbol), the dictionary, then one flag byte that says whether a statistical
// guesser follows. Returns false when decompression fails, when the decoder
// reports an error, or when bytes remain after the last component.
bool generic_morpho::load(istream& is) {
  binary_decoder data;
  if (!compressor::load(is, data)) return false;

  try {
    // Reads one tag stored as a length byte followed by that many characters.
    auto load_tag = [&data](string& tag) {
      unsigned length = data.next_1B();
      tag.assign(data.next<char>(length), length);
    };

    // Load tags
    load_tag(unknown_tag);
    load_tag(number_tag);
    load_tag(punctuation_tag);
    load_tag(symbol_tag);

    // Load dictionary
    dictionary.load(data);

    // Optionally statistical guesser if present
    statistical_guesser.reset();
    if (data.next_1B()) {
      statistical_guesser.reset(new morpho_statistical_guesser());
      statistical_guesser->load(data);
    }
  } catch (binary_decoder_error&) {
    return false;
  }

  // Trailing data means the stream is not a valid generic_morpho model.
  return data.is_end();
}
|
8146
|
|
|
|
|
|
|
|
|
8147
|
7
|
|
|
|
|
|
int generic_morpho::analyze(string_piece form, guesser_mode guesser, vector& lemmas) const { |
|
8148
|
|
|
|
|
|
|
lemmas.clear(); |
|
8149
|
|
|
|
|
|
|
|
|
8150
|
7
|
50
|
|
|
|
|
if (form.len) { |
|
8151
|
|
|
|
|
|
|
// Generate all casing variants if needed (they are different than given form). |
|
8152
|
|
|
|
|
|
|
string form_uclc; // first uppercase, rest lowercase |
|
8153
|
|
|
|
|
|
|
string form_lc; // all lowercase |
|
8154
|
7
|
50
|
|
|
|
|
generate_casing_variants(form, form_uclc, form_lc); |
|
8155
|
|
|
|
|
|
|
|
|
8156
|
|
|
|
|
|
|
// Start by analysing using the dictionary and all casing variants. |
|
8157
|
7
|
50
|
|
|
|
|
dictionary.analyze(form, lemmas); |
|
8158
|
7
|
50
|
|
|
|
|
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
|
|
0
|
|
|
|
|
|
|
8159
|
7
|
100
|
|
|
|
|
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
|
|
50
|
|
|
|
|
|
|
8160
|
7
|
50
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
|
8161
|
|
|
|
|
|
|
|
|
8162
|
|
|
|
|
|
|
// Then call analyze_special to handle numbers, punctuation and symbols. |
|
8163
|
0
|
0
|
|
|
|
|
analyze_special(form, lemmas); |
|
8164
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return NO_GUESSER; |
|
8165
|
|
|
|
|
|
|
|
|
8166
|
|
|
|
|
|
|
// For the statistical guesser, use all casing variants. |
|
8167
|
0
|
0
|
|
|
|
|
if (guesser == GUESSER && statistical_guesser) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8168
|
0
|
0
|
|
|
|
|
if (form_uclc.empty() && form_lc.empty()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8169
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, nullptr); |
|
8170
|
|
|
|
|
|
|
else { |
|
8171
|
0
|
0
|
|
|
|
|
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
|
8172
|
0
|
0
|
|
|
|
|
statistical_guesser->analyze(form, lemmas, &used_rules); |
|
8173
|
0
|
0
|
|
|
|
|
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
|
|
0
|
|
|
|
|
|
|
8174
|
0
|
0
|
|
|
|
|
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
|
|
0
|
|
|
|
|
|
|
8175
|
|
|
|
|
|
|
} |
|
8176
|
|
|
|
|
|
|
} |
|
8177
|
0
|
0
|
|
|
|
|
if (!lemmas.empty()) return GUESSER; |
|
8178
|
|
|
|
|
|
|
} |
|
8179
|
|
|
|
|
|
|
|
|
8180
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
|
8181
|
7
|
|
|
|
|
|
return -1; |
|
8182
|
|
|
|
|
|
|
} |
|
8183
|
|
|
|
|
|
|
|
|
8184
|
0
|
|
|
|
|
|
int generic_morpho::generate(string_piece lemma, const char* tag_wildcard, morpho::guesser_mode /*guesser*/, vector& forms) const { |
|
8185
|
|
|
|
|
|
|
forms.clear(); |
|
8186
|
|
|
|
|
|
|
|
|
8187
|
0
|
|
|
|
|
|
tag_filter filter(tag_wildcard); |
|
8188
|
|
|
|
|
|
|
|
|
8189
|
0
|
0
|
|
|
|
|
if (lemma.len) { |
|
8190
|
0
|
0
|
|
|
|
|
if (dictionary.generate(lemma, filter, forms)) |
|
|
|
0
|
|
|
|
|
|
|
8191
|
|
|
|
|
|
|
return NO_GUESSER; |
|
8192
|
|
|
|
|
|
|
} |
|
8193
|
|
|
|
|
|
|
|
|
8194
|
|
|
|
|
|
|
return -1; |
|
8195
|
|
|
|
|
|
|
} |
|
8196
|
|
|
|
|
|
|
|
|
8197
|
0
|
|
|
|
|
|
// Raw lemma length, as defined by the generic_lemma_addinfo policy.
int generic_morpho::raw_lemma_len(string_piece lemma) const {
  const int length = generic_lemma_addinfo::raw_lemma_len(lemma);
  return length;
}
|
8200
|
|
|
|
|
|
|
|
|
8201
|
0
|
|
|
|
|
|
// Lemma id length, as defined by the generic_lemma_addinfo policy.
int generic_morpho::lemma_id_len(string_piece lemma) const {
  const int length = generic_lemma_addinfo::lemma_id_len(lemma);
  return length;
}
|
8204
|
|
|
|
|
|
|
|
|
8205
|
7
|
|
|
|
|
|
// The raw form is the whole form, so its length is the input length.
int generic_morpho::raw_form_len(string_piece form) const {
  const int length = form.len;
  return length;
}
|
8208
|
|
|
|
|
|
|
|
|
8209
|
0
|
|
|
|
|
|
// Creates a generic_tokenizer for this model's version.
// The returned object is allocated with new; ownership passes to the caller.
tokenizer* generic_morpho::new_tokenizer() const {
  tokenizer* created = new generic_tokenizer(version);
  return created;
}
|
8212
|
|
|
|
|
|
|
|
|
8213
|
0
|
|
|
|
|
|
void generic_morpho::analyze_special(string_piece form, vector& lemmas) const { |
|
8214
|
|
|
|
|
|
|
using namespace unilib; |
|
8215
|
|
|
|
|
|
|
|
|
8216
|
|
|
|
|
|
|
// Analyzer for numbers, punctuation and symbols. |
|
8217
|
|
|
|
|
|
|
// Number is anything matching [+-]? is_Pn* ([.,] is_Pn*)? ([Ee] [+-]? is_Pn+)? for at least one is_Pn* nonempty. |
|
8218
|
|
|
|
|
|
|
// Punctuation is any form beginning with either unicode punctuation or punctuation_exceptions character. |
|
8219
|
|
|
|
|
|
|
// Beware that numbers takes precedence, so - is punctuation, -3 is number, -. is punctuation, -.3 is number. |
|
8220
|
0
|
0
|
|
|
|
|
if (!form.len) return; |
|
8221
|
|
|
|
|
|
|
|
|
8222
|
0
|
|
|
|
|
|
string_piece number = form; |
|
8223
|
0
|
|
|
|
|
|
char32_t first = utf8::decode(number.str, number.len); |
|
8224
|
|
|
|
|
|
|
|
|
8225
|
|
|
|
|
|
|
// Try matching a number. |
|
8226
|
|
|
|
|
|
|
char32_t codepoint = first; |
|
8227
|
|
|
|
|
|
|
bool any_digit = false; |
|
8228
|
0
|
0
|
|
|
|
|
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
|
8229
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
8230
|
0
|
0
|
|
|
|
|
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8231
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
8232
|
0
|
0
|
|
|
|
|
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
|
|
0
|
|
|
|
|
|
|
8233
|
0
|
|
|
|
|
|
codepoint = utf8::decode(number.str, number.len); |
|
8234
|
0
|
0
|
|
|
|
|
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
|
8235
|
|
|
|
|
|
|
any_digit = false; |
|
8236
|
0
|
0
|
|
|
|
|
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
8237
|
|
|
|
|
|
|
} |
|
8238
|
|
|
|
|
|
|
|
|
8239
|
0
|
0
|
|
|
|
|
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8240
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), number_tag); |
|
8241
|
0
|
|
|
|
|
|
return; |
|
8242
|
|
|
|
|
|
|
} |
|
8243
|
|
|
|
|
|
|
|
|
8244
|
|
|
|
|
|
|
// Try matching punctuation or symbol. |
|
8245
|
|
|
|
|
|
|
bool punctuation = true, symbol = true; |
|
8246
|
0
|
|
|
|
|
|
string_piece form_ori = form; |
|
8247
|
0
|
0
|
|
|
|
|
while (form.len) { |
|
8248
|
0
|
|
|
|
|
|
codepoint = utf8::decode(form.str, form.len); |
|
8249
|
0
|
0
|
|
|
|
|
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
|
|
|
0
|
|
|
|
|
|
|
8250
|
0
|
0
|
|
|
|
|
symbol = symbol && unicode::category(codepoint) & unicode::S; |
|
|
|
0
|
|
|
|
|
|
|
8251
|
|
|
|
|
|
|
} |
|
8252
|
0
|
0
|
|
|
|
|
if (punctuation) |
|
8253
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
|
8254
|
0
|
0
|
|
|
|
|
else if (symbol) |
|
8255
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag); |
|
8256
|
|
|
|
|
|
|
} |
|
8257
|
|
|
|
|
|
|
|
|
8258
|
|
|
|
|
|
|
} // namespace morphodita |
|
8259
|
|
|
|
|
|
|
|
|
8260
|
|
|
|
|
|
|
///////// |
|
8261
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho_encoder.h |
|
8262
|
|
|
|
|
|
|
///////// |
|
8263
|
|
|
|
|
|
|
|
|
8264
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8265
|
|
|
|
|
|
|
// |
|
8266
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8267
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8268
|
|
|
|
|
|
|
// |
|
8269
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8270
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8271
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8272
|
|
|
|
|
|
|
|
|
8273
|
|
|
|
|
|
|
namespace morphodita { |
|
8274
|
|
|
|
|
|
|
|
|
8275
|
|
|
|
|
|
|
class generic_morpho_encoder { |
|
8276
|
|
|
|
|
|
|
public: |
|
8277
|
0
|
|
|
|
|
|
struct tags { |
|
8278
|
|
|
|
|
|
|
string unknown_tag, number_tag, punctuation_tag, symbol_tag; |
|
8279
|
|
|
|
|
|
|
}; |
|
8280
|
|
|
|
|
|
|
static void encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho); |
|
8281
|
|
|
|
|
|
|
}; |
|
8282
|
|
|
|
|
|
|
|
|
8283
|
|
|
|
|
|
|
} // namespace morphodita |
|
8284
|
|
|
|
|
|
|
|
|
8285
|
|
|
|
|
|
|
///////// |
|
8286
|
|
|
|
|
|
|
// File: morphodita/morpho/persistent_unordered_map_encoder.h |
|
8287
|
|
|
|
|
|
|
///////// |
|
8288
|
|
|
|
|
|
|
|
|
8289
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8290
|
|
|
|
|
|
|
// |
|
8291
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8292
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8293
|
|
|
|
|
|
|
// |
|
8294
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8295
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8296
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8297
|
|
|
|
|
|
|
|
|
8298
|
|
|
|
|
|
|
namespace morphodita { |
|
8299
|
|
|
|
|
|
|
|
|
8300
|
|
|
|
|
|
|
template |
|
8301
|
0
|
|
|
|
|
|
persistent_unordered_map::persistent_unordered_map(const unordered_map& map, double load_factor, EntryEncode entry_encode) { |
|
8302
|
0
|
0
|
|
|
|
|
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8303
|
0
|
|
|
|
|
|
} |
|
8304
|
|
|
|
|
|
|
|
|
8305
|
|
|
|
|
|
|
template |
|
8306
|
0
|
|
|
|
|
|
persistent_unordered_map::persistent_unordered_map(const unordered_map& map, double load_factor, bool add_prefixes, bool add_suffixes, EntryEncode entry_encode) { |
|
8307
|
|
|
|
|
|
|
// Copy data, possibly including prefixes and suffixes |
|
8308
|
0
|
0
|
|
|
|
|
std::map enlarged_map(map.begin(), map.end()); |
|
|
|
0
|
|
|
|
|
|
|
8309
|
|
|
|
|
|
|
|
|
8310
|
0
|
0
|
|
|
|
|
for (auto&& entry : map) { |
|
|
|
0
|
|
|
|
|
|
|
8311
|
0
|
|
|
|
|
|
const string& key = entry.first; |
|
8312
|
|
|
|
|
|
|
|
|
8313
|
0
|
0
|
|
|
|
|
if (!key.empty() && add_prefixes) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8314
|
0
|
0
|
|
|
|
|
for (unsigned i = key.size() - 1; i; i--) |
|
|
|
0
|
|
|
|
|
|
|
8315
|
0
|
0
|
|
|
|
|
enlarged_map[key.substr(0, i)]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8316
|
|
|
|
|
|
|
|
|
8317
|
0
|
0
|
|
|
|
|
if (!key.empty() && add_suffixes) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8318
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < key.size(); i++) |
|
|
|
0
|
|
|
|
|
|
|
8319
|
0
|
0
|
|
|
|
|
enlarged_map[key.substr(i)]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8320
|
|
|
|
|
|
|
} |
|
8321
|
|
|
|
|
|
|
|
|
8322
|
0
|
0
|
|
|
|
|
construct(enlarged_map, load_factor, entry_encode); |
|
|
|
0
|
|
|
|
|
|
|
8323
|
0
|
|
|
|
|
|
} |
|
8324
|
|
|
|
|
|
|
|
|
8325
|
|
|
|
|
|
|
// We could (and used to) use unordered_map as input parameter. |
|
8326
|
|
|
|
|
|
|
// Nevertheless, as order is unspecified, the resulting persistent_unordered_map |
|
8327
|
|
|
|
|
|
|
// has different collision chains when generated on 32-bit and 64-bit machines. |
|
8328
|
|
|
|
|
|
|
// To guarantee uniform binary representation, we use map instead. |
|
8329
|
|
|
|
|
|
|
template |
|
8330
|
0
|
|
|
|
|
|
void persistent_unordered_map::construct(const map& map, double load_factor, EntryEncode entry_encode) { |
|
8331
|
|
|
|
|
|
|
// 1) Count number of elements for each size |
|
8332
|
|
|
|
|
|
|
vector sizes; |
|
8333
|
0
|
0
|
|
|
|
|
for (auto&& elem : map) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8334
|
0
|
|
|
|
|
|
unsigned len = elem.first.size(); |
|
8335
|
0
|
0
|
|
|
|
|
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8336
|
0
|
|
|
|
|
|
sizes[len]++; |
|
8337
|
|
|
|
|
|
|
} |
|
8338
|
0
|
0
|
|
|
|
|
for (auto&& size : sizes) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8339
|
0
|
0
|
|
|
|
|
resize(unsigned(load_factor * size)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8340
|
|
|
|
|
|
|
|
|
8341
|
|
|
|
|
|
|
// 2) Add sizes of element data |
|
8342
|
0
|
0
|
|
|
|
|
for (auto&& elem : map) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8343
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8344
|
0
|
0
|
|
|
|
|
entry_encode(enc, elem.second); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8345
|
0
|
|
|
|
|
|
add(elem.first.c_str(), elem.first.size(), enc.data.size()); |
|
8346
|
|
|
|
|
|
|
} |
|
8347
|
0
|
0
|
|
|
|
|
done_adding(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8348
|
|
|
|
|
|
|
|
|
8349
|
|
|
|
|
|
|
// 3) Fill in element data |
|
8350
|
0
|
0
|
|
|
|
|
for (auto&& elem : map) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8351
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8352
|
0
|
0
|
|
|
|
|
entry_encode(enc, elem.second); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8353
|
0
|
|
|
|
|
|
small_memcpy(fill(elem.first.c_str(), elem.first.size(), enc.data.size()), enc.data.data(), enc.data.size()); |
|
8354
|
|
|
|
|
|
|
} |
|
8355
|
0
|
|
|
|
|
|
done_filling(); |
|
8356
|
0
|
|
|
|
|
|
} |
|
8357
|
|
|
|
|
|
|
|
|
8358
|
0
|
|
|
|
|
|
// Serializes the map: a one-byte table count, then each table in order.
void persistent_unordered_map::save(binary_encoder& enc) {
  enc.add_1B(hashes.size());

  for (auto& table : hashes) {
    table.save(enc);
  }
}
|
8364
|
|
|
|
|
|
|
|
|
8365
|
0
|
|
|
|
|
|
// Serializes one table as two length-prefixed arrays, `hash` then `data`.
// Each length is written as a 4-byte value.
void persistent_unordered_map::fnv_hash::save(binary_encoder& enc) {
  // Hash array
  enc.add_4B(hash.size());
  enc.add_data(hash);

  // Element data
  enc.add_4B(data.size());
  enc.add_data(data);
}
|
8372
|
|
|
|
|
|
|
|
|
8373
|
|
|
|
|
|
|
} // namespace morphodita |
|
8374
|
|
|
|
|
|
|
|
|
8375
|
|
|
|
|
|
|
///////// |
|
8376
|
|
|
|
|
|
|
// File: morphodita/morpho/raw_morpho_dictionary_reader.h |
|
8377
|
|
|
|
|
|
|
///////// |
|
8378
|
|
|
|
|
|
|
|
|
8379
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8380
|
|
|
|
|
|
|
// |
|
8381
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8382
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8383
|
|
|
|
|
|
|
// |
|
8384
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8385
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8386
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8387
|
|
|
|
|
|
|
|
|
8388
|
|
|
|
|
|
|
namespace morphodita { |
|
8389
|
|
|
|
|
|
|
|
|
8390
|
0
|
|
|
|
|
|
class raw_morpho_dictionary_reader { |
|
8391
|
|
|
|
|
|
|
public: |
|
8392
|
0
|
|
|
|
|
|
raw_morpho_dictionary_reader(istream& in) : in(in) {} |
|
8393
|
|
|
|
|
|
|
bool next_lemma(string& lemma, vector>& tagged_forms); |
|
8394
|
|
|
|
|
|
|
private: |
|
8395
|
|
|
|
|
|
|
istream& in; |
|
8396
|
|
|
|
|
|
|
string line; |
|
8397
|
|
|
|
|
|
|
vector tokens; |
|
8398
|
|
|
|
|
|
|
unordered_set seen_lemmas; |
|
8399
|
|
|
|
|
|
|
}; |
|
8400
|
|
|
|
|
|
|
|
|
8401
|
|
|
|
|
|
|
} // namespace morphodita |
|
8402
|
|
|
|
|
|
|
|
|
8403
|
|
|
|
|
|
|
///////// |
|
8404
|
|
|
|
|
|
|
// File: utils/new_unique_ptr.h |
|
8405
|
|
|
|
|
|
|
///////// |
|
8406
|
|
|
|
|
|
|
|
|
8407
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
8408
|
|
|
|
|
|
|
// |
|
8409
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8410
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8411
|
|
|
|
|
|
|
// |
|
8412
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8413
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8414
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8415
|
|
|
|
|
|
|
|
|
8416
|
|
|
|
|
|
|
namespace utils { |
|
8417
|
|
|
|
|
|
|
|
|
8418
|
|
|
|
|
|
|
template |
|
8419
|
3
|
|
|
|
|
|
unique_ptr new_unique_ptr(Args&&... args) { |
|
8420
|
3
|
50
|
|
|
|
|
return unique_ptr(new T(std::forward(args)...)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8421
|
|
|
|
|
|
|
} |
|
8422
|
|
|
|
|
|
|
|
|
8423
|
|
|
|
|
|
|
} // namespace utils |
|
8424
|
|
|
|
|
|
|
|
|
8425
|
|
|
|
|
|
|
///////// |
|
8426
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_dictionary_encoder.h |
|
8427
|
|
|
|
|
|
|
///////// |
|
8428
|
|
|
|
|
|
|
|
|
8429
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8430
|
|
|
|
|
|
|
// |
|
8431
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8432
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8433
|
|
|
|
|
|
|
// |
|
8434
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8435
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8436
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8437
|
|
|
|
|
|
|
|
|
8438
|
|
|
|
|
|
|
namespace morphodita { |
|
8439
|
|
|
|
|
|
|
|
|
8440
|
|
|
|
|
|
|
// Declarations |
|
8441
|
|
|
|
|
|
|
template |
|
8442
|
|
|
|
|
|
|
class morpho_dictionary_encoder { |
|
8443
|
|
|
|
|
|
|
public: |
|
8444
|
|
|
|
|
|
|
static void encode(istream& is, int max_suffix_len, binary_encoder& enc); |
|
8445
|
|
|
|
|
|
|
}; |
|
8446
|
|
|
|
|
|
|
|
|
8447
|
|
|
|
|
|
|
// Definitions |
|
8448
|
|
|
|
|
|
|
template |
|
8449
|
0
|
|
|
|
|
|
class dictionary { |
|
8450
|
|
|
|
|
|
|
public: |
|
8451
|
|
|
|
|
|
|
void load(istream& is, int max_suffix_len); |
|
8452
|
|
|
|
|
|
|
void encode(binary_encoder& enc); |
|
8453
|
|
|
|
|
|
|
|
|
8454
|
|
|
|
|
|
|
private: |
|
8455
|
0
|
|
|
|
|
|
class trie { |
|
8456
|
|
|
|
|
|
|
public: |
|
8457
|
0
|
|
|
|
|
|
trie() : depth(0) {} |
|
8458
|
|
|
|
|
|
|
|
|
8459
|
0
|
|
|
|
|
|
void add(const char* str) { |
|
8460
|
0
|
0
|
|
|
|
|
if (!*str) return; |
|
8461
|
|
|
|
|
|
|
|
|
8462
|
0
|
0
|
|
|
|
|
for (auto&& child : children) |
|
8463
|
0
|
0
|
|
|
|
|
if (child.first == *str) { |
|
8464
|
0
|
|
|
|
|
|
child.second->add(str + 1); |
|
8465
|
0
|
|
|
|
|
|
depth = max(depth, 1 + child.second->depth); |
|
8466
|
|
|
|
|
|
|
return; |
|
8467
|
|
|
|
|
|
|
} |
|
8468
|
0
|
0
|
|
|
|
|
children.emplace_back(*str, new_unique_ptr()); |
|
8469
|
0
|
|
|
|
|
|
children.back().second->add(str + 1); |
|
8470
|
0
|
|
|
|
|
|
depth = max(depth, 1 + children.back().second->depth); |
|
8471
|
|
|
|
|
|
|
} |
|
8472
|
|
|
|
|
|
|
|
|
8473
|
0
|
|
|
|
|
|
string find_candidate_prefix(int max_suffix_len) { |
|
8474
|
|
|
|
|
|
|
string current, best; |
|
8475
|
0
|
|
|
|
|
|
int best_length = 0; |
|
8476
|
0
|
0
|
|
|
|
|
find_candidate_prefix(max_suffix_len, current, best, best_length, 0); |
|
8477
|
0
|
|
|
|
|
|
return best; |
|
8478
|
|
|
|
|
|
|
} |
|
8479
|
0
|
|
|
|
|
|
void find_candidate_prefix(int max_suffix_len, string& current, string& best, int& best_length, int length) { |
|
8480
|
0
|
0
|
|
|
|
|
if (depth < max_suffix_len && length > best_length) { |
|
|
|
0
|
|
|
|
|
|
|
8481
|
|
|
|
|
|
|
best = current; |
|
8482
|
0
|
|
|
|
|
|
best_length = length; |
|
8483
|
|
|
|
|
|
|
} |
|
8484
|
0
|
0
|
|
|
|
|
for (auto&& child : children) { |
|
8485
|
0
|
|
|
|
|
|
current.push_back(child.first); |
|
8486
|
0
|
0
|
|
|
|
|
child.second->find_candidate_prefix(max_suffix_len, current, best, best_length, children.size() == 1 ? length + 1 : 1); |
|
8487
|
0
|
|
|
|
|
|
current.resize(current.size() - 1); |
|
8488
|
|
|
|
|
|
|
} |
|
8489
|
0
|
|
|
|
|
|
} |
|
8490
|
|
|
|
|
|
|
|
|
8491
|
|
|
|
|
|
|
vector>> children; |
|
8492
|
|
|
|
|
|
|
int depth; |
|
8493
|
|
|
|
|
|
|
}; |
|
8494
|
|
|
|
|
|
|
|
|
8495
|
0
|
|
|
|
|
|
class histogram { |
|
8496
|
|
|
|
|
|
|
public: |
|
8497
|
0
|
|
|
|
|
|
void add(const string& str) { |
|
8498
|
0
|
0
|
|
|
|
|
if (str.size() >= lengths.size()) lengths.resize(str.size() + 1); |
|
8499
|
|
|
|
|
|
|
lengths[str.size()].insert(str); |
|
8500
|
0
|
|
|
|
|
|
} |
|
8501
|
|
|
|
|
|
|
|
|
8502
|
0
|
|
|
|
|
|
void encode(binary_encoder& enc) { |
|
8503
|
0
|
|
|
|
|
|
enc.add_1B(lengths.size()); |
|
8504
|
0
|
0
|
|
|
|
|
for (auto&& set : lengths) |
|
8505
|
0
|
|
|
|
|
|
enc.add_4B(set.size()); |
|
8506
|
0
|
|
|
|
|
|
} |
|
8507
|
|
|
|
|
|
|
|
|
8508
|
|
|
|
|
|
|
vector> lengths; |
|
8509
|
|
|
|
|
|
|
}; |
|
8510
|
|
|
|
|
|
|
|
|
8511
|
0
|
|
|
|
|
|
struct lemma_info { |
|
8512
|
0
|
|
|
|
|
|
lemma_info(string lemma) { |
|
8513
|
0
|
0
|
|
|
|
|
this->lemma = lemma.substr(0, addinfo.parse(lemma, true)); |
|
8514
|
0
|
|
|
|
|
|
} |
|
8515
|
|
|
|
|
|
|
|
|
8516
|
|
|
|
|
|
|
string lemma; |
|
8517
|
|
|
|
|
|
|
LemmaAddinfo addinfo; |
|
8518
|
0
|
|
|
|
|
|
struct lemma_form_info { |
|
8519
|
0
|
|
|
|
|
|
lemma_form_info(string form, int clas) : form(form), clas(clas) {} |
|
8520
|
|
|
|
|
|
|
|
|
8521
|
|
|
|
|
|
|
string form; |
|
8522
|
|
|
|
|
|
|
int clas; |
|
8523
|
|
|
|
|
|
|
|
|
8524
|
0
|
0
|
|
|
|
|
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8525
|
|
|
|
|
|
|
}; |
|
8526
|
|
|
|
|
|
|
vector forms; |
|
8527
|
|
|
|
|
|
|
|
|
8528
|
0
|
0
|
|
|
|
|
bool operator<(const lemma_info& other) const { return lemma < other.lemma || (lemma == other.lemma && addinfo.data < other.addinfo.data); } |
|
8529
|
|
|
|
|
|
|
}; |
|
8530
|
|
|
|
|
|
|
|
|
8531
|
|
|
|
|
|
|
unordered_map classes; |
|
8532
|
|
|
|
|
|
|
unordered_map>> suffixes; |
|
8533
|
|
|
|
|
|
|
|
|
8534
|
|
|
|
|
|
|
vector tags; |
|
8535
|
|
|
|
|
|
|
unordered_map tags_map; |
|
8536
|
|
|
|
|
|
|
|
|
8537
|
|
|
|
|
|
|
histogram lemmas_hist, forms_hist; |
|
8538
|
|
|
|
|
|
|
|
|
8539
|
|
|
|
|
|
|
vector lemmas; |
|
8540
|
|
|
|
|
|
|
}; |
|
8541
|
|
|
|
|
|
|
|
|
8542
|
|
|
|
|
|
|
template |
|
8543
|
0
|
|
|
|
|
|
void morpho_dictionary_encoder::encode(istream& is, int max_suffix_len, binary_encoder& enc) { |
|
8544
|
0
|
|
|
|
|
|
dictionary dict; |
|
8545
|
|
|
|
|
|
|
|
|
8546
|
|
|
|
|
|
|
// Load the dictionary and create classes |
|
8547
|
0
|
0
|
|
|
|
|
dict.load(is, max_suffix_len); |
|
8548
|
|
|
|
|
|
|
|
|
8549
|
|
|
|
|
|
|
// Encode the dictionary |
|
8550
|
0
|
0
|
|
|
|
|
dict.encode(enc); |
|
8551
|
0
|
|
|
|
|
|
} |
|
8552
|
|
|
|
|
|
|
|
|
8553
|
|
|
|
|
|
|
template |
|
8554
|
0
|
|
|
|
|
|
void dictionary::load(istream& is, int max_suffix_len) { |
|
8555
|
|
|
|
|
|
|
// Load lemmas and create classes |
|
8556
|
0
|
|
|
|
|
|
raw_morpho_dictionary_reader raw(is); |
|
8557
|
|
|
|
|
|
|
string lemma; |
|
8558
|
0
|
|
|
|
|
|
vector> forms; |
|
8559
|
0
|
0
|
|
|
|
|
while(raw.next_lemma(lemma, forms)) { |
|
|
|
0
|
|
|
|
|
|
|
8560
|
|
|
|
|
|
|
// Make sure forms are unique |
|
8561
|
|
|
|
|
|
|
sort(forms.begin(), forms.end()); |
|
8562
|
|
|
|
|
|
|
auto forms_end = unique(forms.begin(), forms.end()); |
|
8563
|
0
|
0
|
|
|
|
|
if (forms_end != forms.end()) { |
|
8564
|
|
|
|
|
|
|
// cerr << "Warning: repeated form-tag in lemma " << lemma << '.' << endl; |
|
8565
|
|
|
|
|
|
|
forms.erase(forms_end, forms.end()); |
|
8566
|
|
|
|
|
|
|
} |
|
8567
|
|
|
|
|
|
|
|
|
8568
|
|
|
|
|
|
|
// Create lemma_info |
|
8569
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(lemma); |
|
8570
|
|
|
|
|
|
|
auto& lemma_info = lemmas.back(); |
|
8571
|
0
|
0
|
|
|
|
|
lemmas_hist.add(lemma_info.lemma); |
|
8572
|
|
|
|
|
|
|
|
|
8573
|
|
|
|
|
|
|
// Create classes |
|
8574
|
0
|
0
|
|
|
|
|
while (!forms.empty()) { |
|
8575
|
|
|
|
|
|
|
trie t; |
|
8576
|
0
|
0
|
|
|
|
|
for (auto&& form : forms) |
|
8577
|
0
|
0
|
|
|
|
|
t.add(form.first.c_str()); |
|
8578
|
|
|
|
|
|
|
|
|
8579
|
|
|
|
|
|
|
// Find prefix of forms in class being added. |
|
8580
|
0
|
0
|
|
|
|
|
string prefix = t.find_candidate_prefix(max_suffix_len); |
|
8581
|
|
|
|
|
|
|
|
|
8582
|
|
|
|
|
|
|
// Find forms of the class being added. |
|
8583
|
|
|
|
|
|
|
auto start = forms.begin(); |
|
8584
|
0
|
0
|
|
|
|
|
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8585
|
0
|
0
|
|
|
|
|
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8586
|
|
|
|
|
|
|
auto end = start; |
|
8587
|
0
|
0
|
|
|
|
|
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8588
|
|
|
|
|
|
|
|
|
8589
|
|
|
|
|
|
|
// Find common prefix of class forms -- may be larger than prefix. |
|
8590
|
0
|
|
|
|
|
|
int common_prefix = prefix.size(); |
|
8591
|
0
|
0
|
|
|
|
|
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8592
|
|
|
|
|
|
|
|
|
8593
|
|
|
|
|
|
|
string clas; |
|
8594
|
0
|
0
|
|
|
|
|
for (auto form = start; form != end; form++) { |
|
8595
|
0
|
0
|
|
|
|
|
if (!clas.empty()) clas.push_back('\t'); |
|
|
|
0
|
|
|
|
|
|
|
8596
|
0
|
0
|
|
|
|
|
clas.append(form->first, common_prefix, string::npos); |
|
8597
|
0
|
0
|
|
|
|
|
clas.push_back('\t'); |
|
8598
|
|
|
|
|
|
|
clas.append(form->second); |
|
8599
|
|
|
|
|
|
|
} |
|
8600
|
|
|
|
|
|
|
|
|
8601
|
0
|
|
|
|
|
|
auto class_it = classes.emplace(clas, int(classes.size())); |
|
8602
|
0
|
|
|
|
|
|
int class_id = class_it.first->second; |
|
8603
|
0
|
0
|
|
|
|
|
if (class_it.second) { |
|
8604
|
|
|
|
|
|
|
// New class, add it, together with its tags. |
|
8605
|
0
|
0
|
|
|
|
|
for (auto form = start; form != end; form++) { |
|
8606
|
0
|
|
|
|
|
|
int tag = tags_map.emplace(form->second, int(tags.size())).first->second; |
|
8607
|
0
|
0
|
|
|
|
|
if (tag >= int(tags.size())) tags.emplace_back(form->second); |
|
|
|
0
|
|
|
|
|
|
|
8608
|
0
|
0
|
|
|
|
|
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8609
|
|
|
|
|
|
|
} |
|
8610
|
|
|
|
|
|
|
} |
|
8611
|
|
|
|
|
|
|
|
|
8612
|
|
|
|
|
|
|
// Move forms in the class being added to lemma and remove them from unprocessed forms. |
|
8613
|
0
|
0
|
|
|
|
|
lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id); |
|
|
|
0
|
|
|
|
|
|
|
8614
|
0
|
0
|
|
|
|
|
forms_hist.add(lemma_info.forms.back().form); |
|
8615
|
|
|
|
|
|
|
forms.erase(start, end); |
|
8616
|
|
|
|
|
|
|
} |
|
8617
|
|
|
|
|
|
|
stable_sort(lemma_info.forms.begin(), lemma_info.forms.end()); |
|
8618
|
|
|
|
|
|
|
} |
|
8619
|
|
|
|
|
|
|
stable_sort(lemmas.begin(), lemmas.end()); |
|
8620
|
0
|
|
|
|
|
|
} |
|
8621
|
|
|
|
|
|
|
|
|
8622
|
|
|
|
|
|
|
template |
|
8623
|
0
|
|
|
|
|
|
void dictionary::encode(binary_encoder& enc) { |
|
8624
|
|
|
|
|
|
|
// Encode lemmas and forms |
|
8625
|
0
|
|
|
|
|
|
lemmas_hist.encode(enc); |
|
8626
|
0
|
|
|
|
|
|
forms_hist.encode(enc); |
|
8627
|
|
|
|
|
|
|
|
|
8628
|
0
|
|
|
|
|
|
string prev = ""; |
|
8629
|
0
|
|
|
|
|
|
enc.add_4B(lemmas.size()); |
|
8630
|
0
|
0
|
|
|
|
|
for (auto&& lemma : lemmas) { |
|
8631
|
|
|
|
|
|
|
int cpl = 0; |
|
8632
|
0
|
0
|
|
|
|
|
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8633
|
|
|
|
|
|
|
|
|
8634
|
0
|
0
|
|
|
|
|
enc.add_1B(prev.length() - cpl); |
|
8635
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma.lemma.size() - cpl); |
|
8636
|
0
|
0
|
|
|
|
|
enc.add_data(lemma.lemma.substr(cpl)); |
|
8637
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma.addinfo.data.size()); |
|
8638
|
|
|
|
|
|
|
enc.add_data(lemma.addinfo.data); |
|
8639
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma.forms.size()); |
|
8640
|
|
|
|
|
|
|
|
|
8641
|
|
|
|
|
|
|
string prev_form = lemma.lemma; |
|
8642
|
0
|
0
|
|
|
|
|
for (auto&& lemma_form : lemma.forms) { |
|
8643
|
|
|
|
|
|
|
unsigned best_prev_from = 0, best_form_from = 0, best_len = 0; |
|
8644
|
0
|
0
|
|
|
|
|
for (unsigned prev_from = 0; prev_from < prev_form.size(); prev_from++) |
|
8645
|
0
|
0
|
|
|
|
|
for (unsigned form_from = 0; form_from < lemma_form.form.size(); form_from++) { |
|
8646
|
|
|
|
|
|
|
unsigned len = 0; |
|
8647
|
0
|
0
|
|
|
|
|
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8648
|
0
|
0
|
|
|
|
|
if (len > best_len) best_prev_from = prev_from, best_form_from = form_from, best_len = len; |
|
8649
|
|
|
|
|
|
|
} |
|
8650
|
|
|
|
|
|
|
|
|
8651
|
|
|
|
|
|
|
enum { REMOVE_START = 1, REMOVE_END = 2, ADD_START = 4, ADD_END = 8 }; |
|
8652
|
0
|
0
|
|
|
|
|
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8653
|
|
|
|
|
|
|
ADD_START * (best_form_from>0) + ADD_END * (best_form_from+best_len
|
|
8654
|
0
|
0
|
|
|
|
|
if (best_prev_from > 0) enc.add_1B(best_prev_from); |
|
|
|
0
|
|
|
|
|
|
|
8655
|
0
|
0
|
|
|
|
|
if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len); |
|
|
|
0
|
|
|
|
|
|
|
8656
|
0
|
0
|
|
|
|
|
if (best_form_from > 0) { |
|
8657
|
0
|
0
|
|
|
|
|
enc.add_1B(best_form_from); |
|
8658
|
0
|
0
|
|
|
|
|
enc.add_data(lemma_form.form.substr(0, best_form_from)); |
|
8659
|
|
|
|
|
|
|
} |
|
8660
|
0
|
0
|
|
|
|
|
if (best_form_from + best_len < lemma_form.form.size()) { |
|
8661
|
0
|
0
|
|
|
|
|
enc.add_1B(lemma_form.form.size() - best_form_from - best_len); |
|
8662
|
0
|
0
|
|
|
|
|
enc.add_data(lemma_form.form.substr(best_form_from + best_len)); |
|
8663
|
|
|
|
|
|
|
} |
|
8664
|
0
|
0
|
|
|
|
|
enc.add_2B(lemma_form.clas); |
|
8665
|
|
|
|
|
|
|
|
|
8666
|
0
|
|
|
|
|
|
prev_form = lemma_form.form; |
|
8667
|
|
|
|
|
|
|
} |
|
8668
|
|
|
|
|
|
|
|
|
8669
|
|
|
|
|
|
|
prev = lemma.lemma; |
|
8670
|
|
|
|
|
|
|
} |
|
8671
|
|
|
|
|
|
|
|
|
8672
|
|
|
|
|
|
|
// Encode tags |
|
8673
|
0
|
0
|
|
|
|
|
enc.add_2B(tags.size()); |
|
8674
|
0
|
0
|
|
|
|
|
for (auto&& tag : tags) { |
|
8675
|
0
|
0
|
|
|
|
|
enc.add_1B(tag.size()); |
|
8676
|
|
|
|
|
|
|
enc.add_data(tag); |
|
8677
|
|
|
|
|
|
|
} |
|
8678
|
|
|
|
|
|
|
|
|
8679
|
|
|
|
|
|
|
// Encode classes |
|
8680
|
0
|
0
|
|
|
|
|
persistent_unordered_map(suffixes, 5, false, true, [](binary_encoder& enc, const map>& suffix) { |
|
8681
|
0
|
|
|
|
|
|
enc.add_2B(suffix.size()); |
|
8682
|
0
|
0
|
|
|
|
|
for (auto&& clas : suffix) |
|
8683
|
0
|
|
|
|
|
|
enc.add_2B(clas.first); |
|
8684
|
|
|
|
|
|
|
uint32_t tags = 0, prev_tags = 0; |
|
8685
|
0
|
0
|
|
|
|
|
for (auto&& clas : suffix) { |
|
8686
|
0
|
0
|
|
|
|
|
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
|
8687
|
|
|
|
|
|
|
prev_tags = tags; |
|
8688
|
0
|
|
|
|
|
|
tags += clas.second.size(); |
|
8689
|
|
|
|
|
|
|
} |
|
8690
|
0
|
0
|
|
|
|
|
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
|
8691
|
0
|
0
|
|
|
|
|
for (auto&& clas : suffix) |
|
8692
|
0
|
0
|
|
|
|
|
for (auto&& tag : clas.second) |
|
8693
|
0
|
|
|
|
|
|
enc.add_2B(tag); |
|
8694
|
0
|
0
|
|
|
|
|
}).save(enc); |
|
8695
|
0
|
|
|
|
|
|
} |
|
8696
|
|
|
|
|
|
|
|
|
8697
|
|
|
|
|
|
|
} // namespace morphodita |
|
8698
|
|
|
|
|
|
|
|
|
8699
|
|
|
|
|
|
|
///////// |
|
8700
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_prefix_guesser_encoder.h |
|
8701
|
|
|
|
|
|
|
///////// |
|
8702
|
|
|
|
|
|
|
|
|
8703
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8704
|
|
|
|
|
|
|
// |
|
8705
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8706
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8707
|
|
|
|
|
|
|
// |
|
8708
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8709
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8710
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8711
|
|
|
|
|
|
|
|
|
8712
|
|
|
|
|
|
|
namespace morphodita { |
|
8713
|
|
|
|
|
|
|
|
|
8714
|
|
|
|
|
|
|
class morpho_prefix_guesser_encoder { |
|
8715
|
|
|
|
|
|
|
public: |
|
8716
|
|
|
|
|
|
|
static void encode(istream& is, binary_encoder& enc); |
|
8717
|
|
|
|
|
|
|
}; |
|
8718
|
|
|
|
|
|
|
|
|
8719
|
|
|
|
|
|
|
} // namespace morphodita |
|
8720
|
|
|
|
|
|
|
|
|
8721
|
|
|
|
|
|
|
///////// |
|
8722
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_encoder.h |
|
8723
|
|
|
|
|
|
|
///////// |
|
8724
|
|
|
|
|
|
|
|
|
8725
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8726
|
|
|
|
|
|
|
// |
|
8727
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8728
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8729
|
|
|
|
|
|
|
// |
|
8730
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8731
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8732
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8733
|
|
|
|
|
|
|
|
|
8734
|
|
|
|
|
|
|
namespace morphodita { |
|
8735
|
|
|
|
|
|
|
|
|
8736
|
|
|
|
|
|
|
class morpho_statistical_guesser_encoder { |
|
8737
|
|
|
|
|
|
|
public: |
|
8738
|
|
|
|
|
|
|
static void encode(istream& is, binary_encoder& enc); |
|
8739
|
|
|
|
|
|
|
}; |
|
8740
|
|
|
|
|
|
|
|
|
8741
|
|
|
|
|
|
|
} // namespace morphodita |
|
8742
|
|
|
|
|
|
|
|
|
8743
|
|
|
|
|
|
|
///////// |
|
8744
|
|
|
|
|
|
|
// File: morphodita/morpho/generic_morpho_encoder.cpp |
|
8745
|
|
|
|
|
|
|
///////// |
|
8746
|
|
|
|
|
|
|
|
|
8747
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8748
|
|
|
|
|
|
|
// |
|
8749
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8750
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8751
|
|
|
|
|
|
|
// |
|
8752
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8753
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8754
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8755
|
|
|
|
|
|
|
|
|
8756
|
|
|
|
|
|
|
namespace morphodita { |
|
8757
|
|
|
|
|
|
|
|
|
8758
|
0
|
|
|
|
|
|
void generic_morpho_encoder::encode(istream& in_dictionary, int max_suffix_len, const tags& tags, istream& in_statistical_guesser, ostream& out_morpho) { |
|
8759
|
0
|
|
|
|
|
|
binary_encoder enc; |
|
8760
|
|
|
|
|
|
|
|
|
8761
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.unknown_tag.size()); |
|
8762
|
|
|
|
|
|
|
enc.add_data(tags.unknown_tag); |
|
8763
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.number_tag.size()); |
|
8764
|
|
|
|
|
|
|
enc.add_data(tags.number_tag); |
|
8765
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.punctuation_tag.size()); |
|
8766
|
|
|
|
|
|
|
enc.add_data(tags.punctuation_tag); |
|
8767
|
0
|
0
|
|
|
|
|
enc.add_1B(tags.symbol_tag.size()); |
|
8768
|
|
|
|
|
|
|
enc.add_data(tags.symbol_tag); |
|
8769
|
|
|
|
|
|
|
|
|
8770
|
|
|
|
|
|
|
// cerr << "Encoding dictionary." << endl; |
|
8771
|
0
|
0
|
|
|
|
|
morpho_dictionary_encoder::encode(in_dictionary, max_suffix_len, enc); |
|
8772
|
|
|
|
|
|
|
|
|
8773
|
|
|
|
|
|
|
// Load and encode statistical guesser if requested |
|
8774
|
0
|
0
|
|
|
|
|
enc.add_1B(bool(in_statistical_guesser)); |
|
8775
|
0
|
0
|
|
|
|
|
if (in_statistical_guesser) { |
|
8776
|
|
|
|
|
|
|
// cerr << "Encoding statistical guesser." << endl; |
|
8777
|
0
|
0
|
|
|
|
|
morpho_statistical_guesser_encoder::encode(in_statistical_guesser, enc); |
|
8778
|
|
|
|
|
|
|
} |
|
8779
|
|
|
|
|
|
|
|
|
8780
|
|
|
|
|
|
|
// done, save the dictionary |
|
8781
|
|
|
|
|
|
|
// cerr << "Compressing dictionary." << endl; |
|
8782
|
0
|
0
|
|
|
|
|
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8783
|
|
|
|
|
|
|
// cerr << "Dictionary saved." << endl; |
|
8784
|
0
|
|
|
|
|
|
} |
|
8785
|
|
|
|
|
|
|
|
|
8786
|
|
|
|
|
|
|
} // namespace morphodita |
|
8787
|
|
|
|
|
|
|
|
|
8788
|
|
|
|
|
|
|
///////// |
|
8789
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_ids.h |
|
8790
|
|
|
|
|
|
|
///////// |
|
8791
|
|
|
|
|
|
|
|
|
8792
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8793
|
|
|
|
|
|
|
// |
|
8794
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8795
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8796
|
|
|
|
|
|
|
// |
|
8797
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8798
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8799
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8800
|
|
|
|
|
|
|
|
|
8801
|
|
|
|
|
|
|
namespace morphodita { |
|
8802
|
|
|
|
|
|
|
|
|
8803
|
|
|
|
|
|
|
class morpho_ids { |
|
8804
|
|
|
|
|
|
|
public: |
|
8805
|
|
|
|
|
|
|
enum morpho_id { |
|
8806
|
|
|
|
|
|
|
CZECH = 0, |
|
8807
|
|
|
|
|
|
|
ENGLISH_V1 = 1, |
|
8808
|
|
|
|
|
|
|
GENERIC = 2, |
|
8809
|
|
|
|
|
|
|
EXTERNAL = 3, |
|
8810
|
|
|
|
|
|
|
ENGLISH_V2 = 4, |
|
8811
|
|
|
|
|
|
|
ENGLISH_V3 = 5, ENGLISH = ENGLISH_V3, |
|
8812
|
|
|
|
|
|
|
SLOVAK_PDT = 6, |
|
8813
|
|
|
|
|
|
|
DERIVATOR_DICTIONARY = 7, |
|
8814
|
|
|
|
|
|
|
}; |
|
8815
|
|
|
|
|
|
|
|
|
8816
|
|
|
|
|
|
|
static bool parse(const string& str, morpho_id& id) { |
|
8817
|
|
|
|
|
|
|
if (str == "czech") return id = CZECH, true; |
|
8818
|
|
|
|
|
|
|
if (str == "english") return id = ENGLISH, true; |
|
8819
|
|
|
|
|
|
|
if (str == "external") return id = EXTERNAL, true; |
|
8820
|
|
|
|
|
|
|
if (str == "generic") return id = GENERIC, true; |
|
8821
|
|
|
|
|
|
|
if (str == "slovak_pdt") return id = SLOVAK_PDT, true; |
|
8822
|
|
|
|
|
|
|
return false; |
|
8823
|
|
|
|
|
|
|
} |
|
8824
|
|
|
|
|
|
|
}; |
|
8825
|
|
|
|
|
|
|
|
|
8826
|
|
|
|
|
|
|
typedef morpho_ids::morpho_id morpho_id; |
|
8827
|
|
|
|
|
|
|
|
|
8828
|
|
|
|
|
|
|
} // namespace morphodita |
|
8829
|
|
|
|
|
|
|
|
|
8830
|
|
|
|
|
|
|
///////// |
|
8831
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho.cpp |
|
8832
|
|
|
|
|
|
|
///////// |
|
8833
|
|
|
|
|
|
|
|
|
8834
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8835
|
|
|
|
|
|
|
// |
|
8836
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8837
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8838
|
|
|
|
|
|
|
// |
|
8839
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8840
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8841
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8842
|
|
|
|
|
|
|
|
|
8843
|
|
|
|
|
|
|
namespace morphodita { |
|
8844
|
|
|
|
|
|
|
|
|
8845
|
1
|
|
|
|
|
|
morpho* morpho::load(istream& is) { |
|
8846
|
1
|
|
|
|
|
|
morpho_id id = morpho_id(is.get()); |
|
8847
|
1
|
|
|
|
|
|
switch (id) { |
|
8848
|
|
|
|
|
|
|
case morpho_ids::CZECH: |
|
8849
|
|
|
|
|
|
|
{ |
|
8850
|
0
|
|
|
|
|
|
auto res = new_unique_ptr(czech_morpho::morpho_language::CZECH, 1); |
|
8851
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
0
|
|
|
|
|
|
|
8852
|
|
|
|
|
|
|
break; |
|
8853
|
|
|
|
|
|
|
} |
|
8854
|
|
|
|
|
|
|
case morpho_ids::ENGLISH_V1: |
|
8855
|
|
|
|
|
|
|
case morpho_ids::ENGLISH_V2: |
|
8856
|
|
|
|
|
|
|
case morpho_ids::ENGLISH_V3: |
|
8857
|
|
|
|
|
|
|
{ |
|
8858
|
|
|
|
|
|
|
auto res = new_unique_ptr(id == morpho_ids::ENGLISH_V1 ? 1 : |
|
8859
|
|
|
|
|
|
|
id == morpho_ids::ENGLISH_V2 ? 2 : |
|
8860
|
0
|
0
|
|
|
|
|
3); |
|
|
|
0
|
|
|
|
|
|
|
8861
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
0
|
|
|
|
|
|
|
8862
|
|
|
|
|
|
|
break; |
|
8863
|
|
|
|
|
|
|
} |
|
8864
|
|
|
|
|
|
|
case morpho_ids::EXTERNAL: |
|
8865
|
|
|
|
|
|
|
{ |
|
8866
|
0
|
|
|
|
|
|
auto res = new_unique_ptr(1); |
|
8867
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
0
|
|
|
|
|
|
|
8868
|
|
|
|
|
|
|
break; |
|
8869
|
|
|
|
|
|
|
} |
|
8870
|
|
|
|
|
|
|
case morpho_ids::GENERIC: |
|
8871
|
|
|
|
|
|
|
{ |
|
8872
|
1
|
|
|
|
|
|
auto res = new_unique_ptr(1); |
|
8873
|
1
|
50
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
50
|
|
|
|
|
|
|
8874
|
|
|
|
|
|
|
break; |
|
8875
|
|
|
|
|
|
|
} |
|
8876
|
|
|
|
|
|
|
case morpho_ids::SLOVAK_PDT: |
|
8877
|
|
|
|
|
|
|
{ |
|
8878
|
0
|
|
|
|
|
|
auto res = new_unique_ptr(czech_morpho::morpho_language::SLOVAK, 3); |
|
8879
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
0
|
|
|
|
|
|
|
8880
|
|
|
|
|
|
|
break; |
|
8881
|
|
|
|
|
|
|
} |
|
8882
|
|
|
|
|
|
|
case morpho_ids::DERIVATOR_DICTIONARY: |
|
8883
|
|
|
|
|
|
|
{ |
|
8884
|
0
|
|
|
|
|
|
auto derinet = new_unique_ptr(); |
|
8885
|
0
|
0
|
|
|
|
|
if (!derinet->load(is)) return nullptr; |
|
|
|
0
|
|
|
|
|
|
|
8886
|
|
|
|
|
|
|
|
|
8887
|
0
|
0
|
|
|
|
|
unique_ptr dictionary(load(is)); |
|
8888
|
0
|
0
|
|
|
|
|
if (!dictionary) return nullptr; |
|
8889
|
0
|
|
|
|
|
|
derinet->dictionary = dictionary.get(); |
|
8890
|
|
|
|
|
|
|
dictionary->derinet.reset(derinet.release()); |
|
8891
|
0
|
|
|
|
|
|
return dictionary.release(); |
|
8892
|
|
|
|
|
|
|
} |
|
8893
|
|
|
|
|
|
|
} |
|
8894
|
|
|
|
|
|
|
|
|
8895
|
|
|
|
|
|
|
return nullptr; |
|
8896
|
|
|
|
|
|
|
} |
|
8897
|
|
|
|
|
|
|
|
|
8898
|
0
|
|
|
|
|
|
morpho* morpho::load(const char* fname) { |
|
8899
|
0
|
0
|
|
|
|
|
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
|
8900
|
0
|
0
|
|
|
|
|
if (!f) return nullptr; |
|
8901
|
|
|
|
|
|
|
|
|
8902
|
0
|
0
|
|
|
|
|
return load(f); |
|
8903
|
|
|
|
|
|
|
} |
|
8904
|
|
|
|
|
|
|
|
|
8905
|
0
|
|
|
|
|
|
const derivator* morpho::get_derivator() const { |
|
8906
|
0
|
|
|
|
|
|
return derinet.get(); |
|
8907
|
|
|
|
|
|
|
} |
|
8908
|
|
|
|
|
|
|
|
|
8909
|
|
|
|
|
|
|
} // namespace morphodita |
|
8910
|
|
|
|
|
|
|
|
|
8911
|
|
|
|
|
|
|
///////// |
|
8912
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser.cpp |
|
8913
|
|
|
|
|
|
|
///////// |
|
8914
|
|
|
|
|
|
|
|
|
8915
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
8916
|
|
|
|
|
|
|
// |
|
8917
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
8918
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
8919
|
|
|
|
|
|
|
// |
|
8920
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
8921
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
8922
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
8923
|
|
|
|
|
|
|
|
|
8924
|
|
|
|
|
|
|
namespace morphodita { |
|
8925
|
|
|
|
|
|
|
|
|
8926
|
1
|
|
|
|
|
|
void morpho_statistical_guesser::load(binary_decoder& data) { |
|
8927
|
|
|
|
|
|
|
// Load tags and default tag |
|
8928
|
1
|
|
|
|
|
|
tags.resize(data.next_2B()); |
|
8929
|
7
|
100
|
|
|
|
|
for (auto&& tag : tags) { |
|
8930
|
6
|
|
|
|
|
|
tag.resize(data.next_1B()); |
|
8931
|
403
|
100
|
|
|
|
|
for (unsigned i = 0; i < tag.size(); i++) |
|
8932
|
397
|
|
|
|
|
|
tag[i] = data.next_1B(); |
|
8933
|
|
|
|
|
|
|
} |
|
8934
|
1
|
|
|
|
|
|
default_tag = data.next_2B(); |
|
8935
|
|
|
|
|
|
|
|
|
8936
|
|
|
|
|
|
|
// Load rules |
|
8937
|
1
|
|
|
|
|
|
rules.load(data); |
|
8938
|
1
|
|
|
|
|
|
} |
|
8939
|
|
|
|
|
|
|
|
|
8940
|
|
|
|
|
|
|
// Helper method for analyze. |
|
8941
|
0
|
|
|
|
|
|
static bool contains(morpho_statistical_guesser::used_rules* used, const string& rule) { |
|
8942
|
0
|
0
|
|
|
|
|
if (!used) return false; |
|
8943
|
|
|
|
|
|
|
|
|
8944
|
0
|
0
|
|
|
|
|
for (auto&& used_rule : *used) |
|
8945
|
0
|
0
|
|
|
|
|
if (used_rule == rule) |
|
8946
|
|
|
|
|
|
|
return true; |
|
8947
|
|
|
|
|
|
|
|
|
8948
|
|
|
|
|
|
|
return false; |
|
8949
|
|
|
|
|
|
|
} |
|
8950
|
|
|
|
|
|
|
|
|
8951
|
|
|
|
|
|
|
// Produces unique lemma-tag pairs. |
|
8952
|
0
|
|
|
|
|
|
void morpho_statistical_guesser::analyze(string_piece form, vector& lemmas, morpho_statistical_guesser::used_rules* used) { |
|
8953
|
|
|
|
|
|
|
unsigned lemmas_initial_size = lemmas.size(); |
|
8954
|
|
|
|
|
|
|
|
|
8955
|
|
|
|
|
|
|
// We have rules in format "suffix prefix" in rules. |
|
8956
|
|
|
|
|
|
|
// Find the matching rule with longest suffix and of those with longest prefix. |
|
8957
|
0
|
0
|
|
|
|
|
string rule_label; rule_label.reserve(12); |
|
8958
|
|
|
|
|
|
|
unsigned suffix_len = 0; |
|
8959
|
0
|
0
|
|
|
|
|
for (; suffix_len < form.len; suffix_len++) { |
|
8960
|
0
|
0
|
|
|
|
|
rule_label.push_back(form.str[form.len - (suffix_len + 1)]); |
|
8961
|
0
|
0
|
|
|
|
|
if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); })) |
|
8962
|
|
|
|
|
|
|
break; |
|
8963
|
|
|
|
|
|
|
} |
|
8964
|
|
|
|
|
|
|
|
|
8965
|
0
|
0
|
|
|
|
|
for (suffix_len++; suffix_len--; ) { |
|
8966
|
0
|
|
|
|
|
|
rule_label.resize(suffix_len); |
|
8967
|
0
|
0
|
|
|
|
|
rule_label.push_back(' '); |
|
8968
|
|
|
|
|
|
|
|
|
8969
|
|
|
|
|
|
|
const unsigned char* rule = nullptr; |
|
8970
|
|
|
|
|
|
|
unsigned rule_prefix_len = 0; |
|
8971
|
0
|
0
|
|
|
|
|
for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) { |
|
8972
|
0
|
0
|
|
|
|
|
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
|
|
|
0
|
|
|
|
|
|
|
8973
|
0
|
|
|
|
|
|
const unsigned char* found = rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); }); |
|
8974
|
0
|
0
|
|
|
|
|
if (!found) break; |
|
8975
|
0
|
0
|
|
|
|
|
if (*(found += sizeof(uint16_t))) { |
|
8976
|
|
|
|
|
|
|
rule = found; |
|
8977
|
|
|
|
|
|
|
rule_prefix_len = prefix_len; |
|
8978
|
|
|
|
|
|
|
} |
|
8979
|
|
|
|
|
|
|
} |
|
8980
|
|
|
|
|
|
|
|
|
8981
|
0
|
0
|
|
|
|
|
if (rule) { |
|
8982
|
0
|
|
|
|
|
|
rule_label.resize(suffix_len + 1 + rule_prefix_len); |
|
8983
|
0
|
0
|
|
|
|
|
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8984
|
0
|
0
|
|
|
|
|
if (used) used->push_back(rule_label); |
|
|
|
0
|
|
|
|
|
|
|
8985
|
0
|
0
|
|
|
|
|
for (int rules_len = *rule++; rules_len; rules_len--) { |
|
8986
|
0
|
|
|
|
|
|
unsigned pref_del_len = *rule++; const char* pref_del = (const char*)rule; rule += pref_del_len; |
|
8987
|
0
|
|
|
|
|
|
unsigned pref_add_len = *rule++; const char* pref_add = (const char*)rule; rule += pref_add_len; |
|
8988
|
0
|
|
|
|
|
|
unsigned suff_del_len = *rule++; const char* suff_del = (const char*)rule; rule += suff_del_len; |
|
8989
|
0
|
|
|
|
|
|
unsigned suff_add_len = *rule++; const char* suff_add = (const char*)rule; rule += suff_add_len; |
|
8990
|
0
|
|
|
|
|
|
unsigned tags_len = *rule++; const uint16_t* tags = (const uint16_t*)rule; rule += tags_len * sizeof(uint16_t); |
|
8991
|
|
|
|
|
|
|
|
|
8992
|
0
|
0
|
|
|
|
|
if (pref_del_len + suff_del_len > form.len || |
|
|
|
0
|
|
|
|
|
|
|
8993
|
0
|
0
|
|
|
|
|
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
|
|
|
0
|
|
|
|
|
|
|
8994
|
0
|
0
|
|
|
|
|
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
8995
|
0
|
|
|
|
|
|
(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len == 0)) |
|
8996
|
0
|
|
|
|
|
|
continue; |
|
8997
|
|
|
|
|
|
|
|
|
8998
|
|
|
|
|
|
|
string lemma; |
|
8999
|
0
|
0
|
|
|
|
|
lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len); |
|
9000
|
0
|
0
|
|
|
|
|
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
|
|
|
0
|
|
|
|
|
|
|
9001
|
0
|
0
|
|
|
|
|
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
|
|
|
0
|
|
|
|
|
|
|
9002
|
0
|
0
|
|
|
|
|
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
|
|
|
0
|
|
|
|
|
|
|
9003
|
0
|
0
|
|
|
|
|
while (tags_len--) |
|
9004
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(lemma, this->tags[unaligned_load_inc(tags)]); |
|
9005
|
|
|
|
|
|
|
} |
|
9006
|
|
|
|
|
|
|
} |
|
9007
|
|
|
|
|
|
|
break; |
|
9008
|
|
|
|
|
|
|
} |
|
9009
|
|
|
|
|
|
|
} |
|
9010
|
|
|
|
|
|
|
|
|
9011
|
|
|
|
|
|
|
// If nothing was found, use default tag. |
|
9012
|
0
|
0
|
|
|
|
|
if (lemmas.size() == lemmas_initial_size) |
|
9013
|
0
|
0
|
|
|
|
|
if (!contains(used, string())) { |
|
9014
|
0
|
0
|
|
|
|
|
if (used) used->push_back(string()); |
|
9015
|
0
|
0
|
|
|
|
|
lemmas.emplace_back(string(form.str, form.len), tags[default_tag]); |
|
9016
|
|
|
|
|
|
|
} |
|
9017
|
0
|
|
|
|
|
|
} |
|
9018
|
|
|
|
|
|
|
|
|
9019
|
|
|
|
|
|
|
} // namespace morphodita |
|
9020
|
|
|
|
|
|
|
|
|
9021
|
|
|
|
|
|
|
///////// |
|
9022
|
|
|
|
|
|
|
// File: utils/split.h |
|
9023
|
|
|
|
|
|
|
///////// |
|
9024
|
|
|
|
|
|
|
|
|
9025
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>. |
|
9026
|
|
|
|
|
|
|
// |
|
9027
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9028
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9029
|
|
|
|
|
|
|
// |
|
9030
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9031
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9032
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9033
|
|
|
|
|
|
|
|
|
9034
|
|
|
|
|
|
|
namespace utils { |
|
9035
|
|
|
|
|
|
|
|
|
9036
|
|
|
|
|
|
|
// |
|
9037
|
|
|
|
|
|
|
// Declarations |
|
9038
|
|
|
|
|
|
|
// |
|
9039
|
|
|
|
|
|
|
|
|
9040
|
|
|
|
|
|
|
// Split given text on the separator character. |
|
9041
|
|
|
|
|
|
|
inline void split(const string& text, char sep, vector& tokens); |
|
9042
|
|
|
|
|
|
|
inline void split(string_piece text, char sep, vector& tokens); |
|
9043
|
|
|
|
|
|
|
|
|
9044
|
|
|
|
|
|
|
// |
|
9045
|
|
|
|
|
|
|
// Definitions |
|
9046
|
|
|
|
|
|
|
// |
|
9047
|
|
|
|
|
|
|
|
|
9048
|
0
|
|
|
|
|
|
void split(const string& text, char sep, vector& tokens) { |
|
9049
|
0
|
|
|
|
|
|
tokens.clear(); |
|
9050
|
0
|
0
|
|
|
|
|
if (text.empty()) return; |
|
9051
|
|
|
|
|
|
|
|
|
9052
|
0
|
|
|
|
|
|
string::size_type index = 0; |
|
9053
|
0
|
0
|
|
|
|
|
for (string::size_type next; (next = text.find(sep, index)) != string::npos; index = next + 1) |
|
9054
|
0
|
|
|
|
|
|
tokens.emplace_back(text, index, next - index); |
|
9055
|
|
|
|
|
|
|
|
|
9056
|
0
|
|
|
|
|
|
tokens.emplace_back(text, index); |
|
9057
|
|
|
|
|
|
|
} |
|
9058
|
|
|
|
|
|
|
|
|
9059
|
53
|
|
|
|
|
|
void split(string_piece text, char sep, vector& tokens) { |
|
9060
|
|
|
|
|
|
|
tokens.clear(); |
|
9061
|
53
|
50
|
|
|
|
|
if (!text.len) return; |
|
9062
|
|
|
|
|
|
|
|
|
9063
|
53
|
|
|
|
|
|
const char* str = text.str; |
|
9064
|
121
|
100
|
|
|
|
|
for (const char* next; (next = (const char*) memchr(str, sep, text.str + text.len - str)); str = next + 1) |
|
9065
|
68
|
|
|
|
|
|
tokens.emplace_back(str, next - str); |
|
9066
|
|
|
|
|
|
|
|
|
9067
|
53
|
|
|
|
|
|
tokens.emplace_back(str, text.str + text.len - str); |
|
9068
|
|
|
|
|
|
|
} |
|
9069
|
|
|
|
|
|
|
|
|
9070
|
|
|
|
|
|
|
} // namespace utils |
|
9071
|
|
|
|
|
|
|
|
|
9072
|
|
|
|
|
|
|
///////// |
|
9073
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_encoder.cpp |
|
9074
|
|
|
|
|
|
|
///////// |
|
9075
|
|
|
|
|
|
|
|
|
9076
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
|
9077
|
|
|
|
|
|
|
// |
|
9078
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9079
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9080
|
|
|
|
|
|
|
// |
|
9081
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9082
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9083
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9084
|
|
|
|
|
|
|
|
|
9085
|
|
|
|
|
|
|
namespace morphodita { |
|
9086
|
|
|
|
|
|
|
|
|
9087
|
0
|
|
|
|
|
|
void morpho_statistical_guesser_encoder::encode(istream& is, binary_encoder& enc) { |
|
9088
|
|
|
|
|
|
|
unordered_map, vector>>> statistical_guesser; |
|
9089
|
0
|
|
|
|
|
|
vector tags; |
|
9090
|
|
|
|
|
|
|
unordered_map tags_map; |
|
9091
|
|
|
|
|
|
|
|
|
9092
|
|
|
|
|
|
|
// Load statistical guesser |
|
9093
|
|
|
|
|
|
|
string line; |
|
9094
|
0
|
|
|
|
|
|
vector tokens; |
|
9095
|
0
|
0
|
|
|
|
|
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9096
|
0
|
|
|
|
|
|
int statistical_guesser_default = tags_map.emplace(line.data(), int(tags.size())).first->second; |
|
9097
|
0
|
0
|
|
|
|
|
if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data()); |
|
|
|
0
|
|
|
|
|
|
|
9098
|
|
|
|
|
|
|
|
|
9099
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
|
0
|
|
|
|
|
|
|
9100
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
|
9101
|
0
|
0
|
|
|
|
|
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9102
|
|
|
|
|
|
|
|
|
9103
|
0
|
|
|
|
|
|
vector affixes; |
|
9104
|
0
|
0
|
|
|
|
|
split(tokens[0], ' ', affixes); |
|
9105
|
0
|
0
|
|
|
|
|
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9106
|
|
|
|
|
|
|
reverse(affixes[1].begin(), affixes[1].end()); |
|
9107
|
|
|
|
|
|
|
|
|
9108
|
0
|
0
|
|
|
|
|
auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]]; |
|
|
|
0
|
|
|
|
|
|
|
9109
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < tokens.size(); i+= 2) { |
|
9110
|
0
|
|
|
|
|
|
vector replacements; |
|
9111
|
0
|
0
|
|
|
|
|
split(tokens[i], ' ', replacements); |
|
9112
|
0
|
0
|
|
|
|
|
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9113
|
|
|
|
|
|
|
|
|
9114
|
0
|
|
|
|
|
|
vector rule_tags; |
|
9115
|
0
|
0
|
|
|
|
|
split(tokens[i+1], ' ', rule_tags); |
|
9116
|
|
|
|
|
|
|
vector decoded_tags; |
|
9117
|
0
|
0
|
|
|
|
|
for (auto&& rule_tag : rule_tags) { |
|
9118
|
0
|
|
|
|
|
|
int tag = tags_map.emplace(rule_tag, int(tags.size())).first->second; |
|
9119
|
0
|
0
|
|
|
|
|
if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag); |
|
|
|
0
|
|
|
|
|
|
|
9120
|
0
|
0
|
|
|
|
|
decoded_tags.emplace_back(tag); |
|
9121
|
|
|
|
|
|
|
} |
|
9122
|
|
|
|
|
|
|
|
|
9123
|
0
|
0
|
|
|
|
|
rules.emplace_back(replacements, decoded_tags); |
|
9124
|
|
|
|
|
|
|
} |
|
9125
|
|
|
|
|
|
|
} |
|
9126
|
|
|
|
|
|
|
|
|
9127
|
|
|
|
|
|
|
// Encode statistical guesser |
|
9128
|
0
|
0
|
|
|
|
|
enc.add_2B(tags.size()); |
|
9129
|
0
|
0
|
|
|
|
|
for (auto&& tag : tags) { |
|
9130
|
0
|
0
|
|
|
|
|
enc.add_1B(tag.size()); |
|
9131
|
|
|
|
|
|
|
enc.add_data(tag); |
|
9132
|
|
|
|
|
|
|
} |
|
9133
|
0
|
0
|
|
|
|
|
enc.add_2B(statistical_guesser_default); |
|
9134
|
|
|
|
|
|
|
|
|
9135
|
0
|
|
|
|
|
|
persistent_unordered_map(statistical_guesser, 5, true, false, [](binary_encoder& enc, vector, vector>> rules) { |
|
9136
|
0
|
|
|
|
|
|
binary_encoder e; |
|
9137
|
0
|
0
|
|
|
|
|
e.add_1B(rules.size()); |
|
9138
|
0
|
0
|
|
|
|
|
for (auto&& rule : rules) { |
|
9139
|
0
|
0
|
|
|
|
|
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9140
|
0
|
0
|
|
|
|
|
for (auto&& affix : rule.first) { |
|
9141
|
0
|
0
|
|
|
|
|
e.add_1B(affix.size()); |
|
9142
|
|
|
|
|
|
|
e.add_data(affix); |
|
9143
|
|
|
|
|
|
|
} |
|
9144
|
0
|
0
|
|
|
|
|
e.add_1B(rule.second.size()); |
|
9145
|
0
|
0
|
|
|
|
|
for (auto&& tag : rule.second) |
|
9146
|
0
|
0
|
|
|
|
|
e.add_2B(tag); |
|
9147
|
|
|
|
|
|
|
} |
|
9148
|
0
|
0
|
|
|
|
|
enc.add_2B(e.data.size()); |
|
9149
|
|
|
|
|
|
|
enc.add_data(e.data); |
|
9150
|
0
|
0
|
|
|
|
|
}).save(enc); |
|
|
|
0
|
|
|
|
|
|
|
9151
|
0
|
|
|
|
|
|
} |
|
9152
|
|
|
|
|
|
|
|
|
9153
|
|
|
|
|
|
|
} // namespace morphodita |
|
9154
|
|
|
|
|
|
|
|
|
9155
|
|
|
|
|
|
|
///////// |
|
9156
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_trainer.h |
|
9157
|
|
|
|
|
|
|
///////// |
|
9158
|
|
|
|
|
|
|
|
|
9159
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
|
9160
|
|
|
|
|
|
|
// |
|
9161
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9162
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9163
|
|
|
|
|
|
|
// |
|
9164
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9165
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9166
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9167
|
|
|
|
|
|
|
|
|
9168
|
|
|
|
|
|
|
namespace morphodita { |
|
9169
|
|
|
|
|
|
|
|
|
9170
|
|
|
|
|
|
|
class morpho_statistical_guesser_trainer { |
|
9171
|
|
|
|
|
|
|
public: |
|
9172
|
|
|
|
|
|
|
static void train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os); |
|
9173
|
|
|
|
|
|
|
|
|
9174
|
|
|
|
|
|
|
private: |
|
9175
|
0
|
|
|
|
|
|
struct instance { |
|
9176
|
|
|
|
|
|
|
string form, lemma, tag; |
|
9177
|
|
|
|
|
|
|
string lemma_rule, form_prefix; |
|
9178
|
|
|
|
|
|
|
|
|
9179
|
|
|
|
|
|
|
instance(const string& form, const string& lemma, const string& tag); |
|
9180
|
|
|
|
|
|
|
}; |
|
9181
|
|
|
|
|
|
|
|
|
9182
|
|
|
|
|
|
|
enum casing { CASE_LC, CASE_UCLC, CASE_UC, CASE_OTHER }; |
|
9183
|
|
|
|
|
|
|
static casing get_casing(const string& word, bool allow_nonletters); |
|
9184
|
|
|
|
|
|
|
static void set_casing(const string& original, casing c, string& word); |
|
9185
|
|
|
|
|
|
|
static bool suffix(const string& word, unsigned& length); |
|
9186
|
|
|
|
|
|
|
}; |
|
9187
|
|
|
|
|
|
|
|
|
9188
|
|
|
|
|
|
|
} // namespace morphodita |
|
9189
|
|
|
|
|
|
|
|
|
9190
|
|
|
|
|
|
|
///////// |
|
9191
|
|
|
|
|
|
|
// File: morphodita/morpho/morpho_statistical_guesser_trainer.cpp |
|
9192
|
|
|
|
|
|
|
///////// |
|
9193
|
|
|
|
|
|
|
|
|
9194
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
|
9195
|
|
|
|
|
|
|
// |
|
9196
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9197
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9198
|
|
|
|
|
|
|
// |
|
9199
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9200
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9201
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9202
|
|
|
|
|
|
|
|
|
9203
|
|
|
|
|
|
|
namespace morphodita { |
|
9204
|
|
|
|
|
|
|
|
|
9205
|
0
|
|
|
|
|
|
void morpho_statistical_guesser_trainer::train(istream& is, unsigned suffix_len, unsigned rules_per_suffix, unsigned max_prefixes, unsigned min_prefix_count, ostream& os) { |
|
9206
|
0
|
|
|
|
|
|
vector data; |
|
9207
|
|
|
|
|
|
|
|
|
9208
|
|
|
|
|
|
|
// Load training data |
|
9209
|
|
|
|
|
|
|
string form; |
|
9210
|
0
|
|
|
|
|
|
vector tokens; |
|
9211
|
0
|
0
|
|
|
|
|
for (string line; getline(is, line);) { |
|
|
|
0
|
|
|
|
|
|
|
9212
|
0
|
0
|
|
|
|
|
if (line.empty()) continue; |
|
9213
|
|
|
|
|
|
|
|
|
9214
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
|
9215
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9216
|
0
|
0
|
|
|
|
|
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9217
|
|
|
|
|
|
|
|
|
9218
|
|
|
|
|
|
|
// Normalize case |
|
9219
|
0
|
|
|
|
|
|
casing form_case = get_casing(tokens[0], false); |
|
9220
|
0
|
|
|
|
|
|
casing lemma_case = get_casing(tokens[1], true); |
|
9221
|
0
|
0
|
|
|
|
|
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9222
|
0
|
|
|
|
|
|
(lemma_case == CASE_UCLC && form_case == CASE_UC)) { |
|
9223
|
0
|
0
|
|
|
|
|
set_casing(tokens[0], lemma_case, form); |
|
9224
|
|
|
|
|
|
|
} else { |
|
9225
|
0
|
|
|
|
|
|
form.swap(tokens[0]); |
|
9226
|
|
|
|
|
|
|
} |
|
9227
|
|
|
|
|
|
|
|
|
9228
|
0
|
0
|
|
|
|
|
data.emplace_back(form, tokens[1], tokens[2]); |
|
9229
|
|
|
|
|
|
|
} |
|
9230
|
|
|
|
|
|
|
|
|
9231
|
|
|
|
|
|
|
// Generate at most max_prefixes prefixes with min_prefix_count |
|
9232
|
|
|
|
|
|
|
unordered_map> prefixes_with_forms; |
|
9233
|
0
|
0
|
|
|
|
|
for (auto&& instance : data) |
|
9234
|
0
|
0
|
|
|
|
|
if (!instance.form_prefix.empty()) |
|
9235
|
0
|
|
|
|
|
|
prefixes_with_forms[instance.form_prefix].insert(instance.form); |
|
9236
|
|
|
|
|
|
|
|
|
9237
|
0
|
|
|
|
|
|
vector> prefixes_with_counts; |
|
9238
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes_with_forms) |
|
9239
|
0
|
0
|
|
|
|
|
if (prefix.second.size() >= min_prefix_count) |
|
9240
|
0
|
0
|
|
|
|
|
prefixes_with_counts.emplace_back(unsigned(prefix.second.size()), prefix.first); |
|
9241
|
|
|
|
|
|
|
|
|
9242
|
0
|
0
|
|
|
|
|
if (prefixes_with_counts.size() > max_prefixes) { |
|
9243
|
|
|
|
|
|
|
sort(prefixes_with_counts.begin(), prefixes_with_counts.end(), greater>()); |
|
9244
|
0
|
0
|
|
|
|
|
prefixes_with_counts.resize(max_prefixes); |
|
9245
|
|
|
|
|
|
|
} |
|
9246
|
|
|
|
|
|
|
|
|
9247
|
|
|
|
|
|
|
unordered_set prefixes; |
|
9248
|
|
|
|
|
|
|
prefixes.emplace(); |
|
9249
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes_with_counts) |
|
9250
|
0
|
|
|
|
|
|
prefixes.insert(prefix.second); |
|
9251
|
|
|
|
|
|
|
|
|
9252
|
|
|
|
|
|
|
// Generate the guesser rules |
|
9253
|
|
|
|
|
|
|
unordered_map> tags; |
|
9254
|
|
|
|
|
|
|
unordered_map>> rules; |
|
9255
|
|
|
|
|
|
|
unordered_set suffixes; |
|
9256
|
|
|
|
|
|
|
string prefix_suffix, tag_lemma_rule; |
|
9257
|
0
|
0
|
|
|
|
|
for (auto&& instance : data) { |
|
9258
|
|
|
|
|
|
|
// Add tag |
|
9259
|
0
|
|
|
|
|
|
tags[instance.tag].insert(instance.form); |
|
9260
|
|
|
|
|
|
|
|
|
9261
|
|
|
|
|
|
|
// Find longest matching prefix |
|
9262
|
|
|
|
|
|
|
unsigned prefix_length = 0; |
|
9263
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes) |
|
9264
|
0
|
0
|
|
|
|
|
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9265
|
0
|
|
|
|
|
|
prefix_length = prefix.size(); |
|
9266
|
|
|
|
|
|
|
|
|
9267
|
0
|
0
|
|
|
|
|
tag_lemma_rule.assign(instance.lemma_rule).append("\t").append(instance.tag); |
|
9268
|
|
|
|
|
|
|
|
|
9269
|
|
|
|
|
|
|
// Add prefix + all suffixes of length 1..suffix_len to rules |
|
9270
|
0
|
0
|
|
|
|
|
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9271
|
0
|
0
|
|
|
|
|
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9272
|
|
|
|
|
|
|
rules[prefix_suffix][tag_lemma_rule].insert(instance.form); |
|
9273
|
0
|
|
|
|
|
|
suffixes.emplace(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
9274
|
|
|
|
|
|
|
} |
|
9275
|
|
|
|
|
|
|
} |
|
9276
|
|
|
|
|
|
|
|
|
9277
|
|
|
|
|
|
|
// Start generating the guesser description by writing the most "frequent" tag |
|
9278
|
|
|
|
|
|
|
string most_frequent_tag; unsigned most_frequent_tag_count = 0; |
|
9279
|
0
|
0
|
|
|
|
|
for (auto&& tag : tags) |
|
9280
|
0
|
0
|
|
|
|
|
if (tag.second.size() > most_frequent_tag_count) |
|
9281
|
0
|
|
|
|
|
|
most_frequent_tag.assign(tag.first), most_frequent_tag_count = tag.second.size(); |
|
9282
|
|
|
|
|
|
|
|
|
9283
|
|
|
|
|
|
|
os << most_frequent_tag << endl; |
|
9284
|
|
|
|
|
|
|
|
|
9285
|
|
|
|
|
|
|
// For every prefix-suffix, write at most rules_per_suffix most "frequent" rules |
|
9286
|
|
|
|
|
|
|
string rule_key, output; |
|
9287
|
|
|
|
|
|
|
unordered_set rules_set; |
|
9288
|
0
|
|
|
|
|
|
vector> rules_counts; |
|
9289
|
0
|
0
|
|
|
|
|
for (auto&& suffix : suffixes) { |
|
9290
|
0
|
0
|
|
|
|
|
for (auto&& prefix : prefixes) { |
|
9291
|
0
|
|
|
|
|
|
rules_counts.clear(); |
|
9292
|
|
|
|
|
|
|
rules_set.clear(); |
|
9293
|
|
|
|
|
|
|
|
|
9294
|
|
|
|
|
|
|
// Gather at most rules_per_suffix rules |
|
9295
|
0
|
0
|
|
|
|
|
for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) { |
|
|
|
0
|
|
|
|
|
|
|
9296
|
0
|
0
|
|
|
|
|
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9297
|
0
|
0
|
|
|
|
|
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9298
|
0
|
0
|
|
|
|
|
if (!rules.count(rule_key)) continue; |
|
9299
|
|
|
|
|
|
|
|
|
9300
|
|
|
|
|
|
|
unsigned rules_counts_original = rules_counts.size(); |
|
9301
|
0
|
0
|
|
|
|
|
for (auto&& entry : rules[rule_key]) |
|
9302
|
0
|
0
|
|
|
|
|
if (!rules_set.count(entry.first)) { |
|
9303
|
0
|
0
|
|
|
|
|
rules_counts.emplace_back(unsigned(entry.second.size()), entry.first); |
|
9304
|
|
|
|
|
|
|
rules_set.insert(entry.first); |
|
9305
|
|
|
|
|
|
|
} |
|
9306
|
|
|
|
|
|
|
|
|
9307
|
|
|
|
|
|
|
sort(rules_counts.begin() + rules_counts_original, rules_counts.end(), greater>()); |
|
9308
|
|
|
|
|
|
|
|
|
9309
|
0
|
0
|
|
|
|
|
if (rules_counts.size() >= rules_per_suffix) { |
|
9310
|
0
|
0
|
|
|
|
|
rules_counts.resize(rules_per_suffix); |
|
9311
|
|
|
|
|
|
|
break; |
|
9312
|
|
|
|
|
|
|
} |
|
9313
|
|
|
|
|
|
|
} |
|
9314
|
|
|
|
|
|
|
// Stop if there are no rules for given prefix |
|
9315
|
0
|
0
|
|
|
|
|
if (rules_set.empty()) break; |
|
9316
|
|
|
|
|
|
|
} |
|
9317
|
0
|
0
|
|
|
|
|
if (!rules_set.empty()) { |
|
9318
|
|
|
|
|
|
|
// Write the chosen rules |
|
9319
|
0
|
0
|
|
|
|
|
output.assign(prefix).append(" ").append(suffix); |
|
9320
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < rules_counts.size(); i++) { |
|
9321
|
0
|
|
|
|
|
|
unsigned tab = rules_counts[i].second.find('\t'); |
|
9322
|
|
|
|
|
|
|
|
|
9323
|
0
|
0
|
|
|
|
|
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9324
|
|
|
|
|
|
|
|
|
9325
|
|
|
|
|
|
|
// Join rules with same lemma_rule |
|
9326
|
0
|
0
|
|
|
|
|
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9327
|
0
|
0
|
|
|
|
|
output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos); |
|
|
|
0
|
|
|
|
|
|
|
9328
|
|
|
|
|
|
|
} |
|
9329
|
|
|
|
|
|
|
os << output << endl; |
|
9330
|
|
|
|
|
|
|
} |
|
9331
|
|
|
|
|
|
|
} |
|
9332
|
|
|
|
|
|
|
} |
|
9333
|
0
|
|
|
|
|
|
} |
|
9334
|
|
|
|
|
|
|
|
|
9335
|
0
|
|
|
|
|
|
morpho_statistical_guesser_trainer::instance::instance(const string& form, const string& lemma, const string& tag) |
|
9336
|
0
|
|
|
|
|
|
: form(form), lemma(lemma), tag(tag) |
|
9337
|
|
|
|
|
|
|
{ |
|
9338
|
|
|
|
|
|
|
using namespace unilib; |
|
9339
|
|
|
|
|
|
|
|
|
9340
|
|
|
|
|
|
|
unsigned length_best = 0; |
|
9341
|
|
|
|
|
|
|
int form_best = 0, lemma_best = 0; |
|
9342
|
0
|
0
|
|
|
|
|
for (int offset = -int(lemma.size() - 1); offset < int(form.size()) - 1; offset++) { |
|
9343
|
0
|
|
|
|
|
|
unsigned form_offset = max(0, offset); |
|
9344
|
0
|
|
|
|
|
|
unsigned lemma_offset = max(0, -offset); |
|
9345
|
0
|
0
|
|
|
|
|
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9346
|
0
|
0
|
|
|
|
|
if (form[form_offset] == lemma[lemma_offset]) { |
|
9347
|
0
|
0
|
|
|
|
|
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9348
|
0
|
|
|
|
|
|
length_best = length, form_best = form_offset + 1 - length, lemma_best = lemma_offset + 1 - length; |
|
9349
|
|
|
|
|
|
|
} else { |
|
9350
|
|
|
|
|
|
|
length = 0; |
|
9351
|
|
|
|
|
|
|
} |
|
9352
|
|
|
|
|
|
|
} |
|
9353
|
|
|
|
|
|
|
|
|
9354
|
0
|
0
|
|
|
|
|
form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0); |
|
|
|
0
|
|
|
|
|
|
|
9355
|
0
|
0
|
|
|
|
|
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9356
|
0
|
0
|
|
|
|
|
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9357
|
0
|
|
|
|
|
|
} |
|
9358
|
|
|
|
|
|
|
|
|
9359
|
0
|
|
|
|
|
|
morpho_statistical_guesser_trainer::casing morpho_statistical_guesser_trainer::get_casing(const string& word, bool allow_nonletters) { |
|
9360
|
|
|
|
|
|
|
using namespace unilib; |
|
9361
|
|
|
|
|
|
|
|
|
9362
|
|
|
|
|
|
|
casing c = CASE_OTHER; |
|
9363
|
|
|
|
|
|
|
int index = 0; |
|
9364
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) { |
|
9365
|
0
|
|
|
|
|
|
auto cat = unicode::category(chr); |
|
9366
|
|
|
|
|
|
|
|
|
9367
|
|
|
|
|
|
|
// Return OTHER for non-letters |
|
9368
|
0
|
0
|
|
|
|
|
if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue; |
|
|
|
0
|
|
|
|
|
|
|
9369
|
0
|
0
|
|
|
|
|
if (cat & ~unicode::L) return CASE_OTHER; |
|
9370
|
|
|
|
|
|
|
|
|
9371
|
0
|
0
|
|
|
|
|
if (index == 0) { |
|
9372
|
0
|
0
|
|
|
|
|
c = cat & unicode::Ll ? CASE_LC : CASE_UC; |
|
9373
|
0
|
0
|
|
|
|
|
} else if (c == CASE_UC && index == 1) { |
|
9374
|
0
|
0
|
|
|
|
|
c = cat & unicode::Ll ? CASE_UCLC : CASE_UC; |
|
9375
|
0
|
0
|
|
|
|
|
} else if (c == CASE_UC) { |
|
9376
|
0
|
0
|
|
|
|
|
if (cat & ~unicode::Lut) return CASE_OTHER; |
|
9377
|
|
|
|
|
|
|
} else /*CASE_LC or CASE_UCLC*/ { |
|
9378
|
0
|
0
|
|
|
|
|
if (cat & ~unicode::Ll) return CASE_OTHER; |
|
9379
|
|
|
|
|
|
|
} |
|
9380
|
0
|
|
|
|
|
|
index++; |
|
9381
|
|
|
|
|
|
|
} |
|
9382
|
0
|
|
|
|
|
|
return c; |
|
9383
|
|
|
|
|
|
|
} |
|
9384
|
|
|
|
|
|
|
|
|
9385
|
0
|
|
|
|
|
|
void morpho_statistical_guesser_trainer::set_casing(const string& original, casing c, string& word) { |
|
9386
|
|
|
|
|
|
|
using namespace unilib; |
|
9387
|
|
|
|
|
|
|
|
|
9388
|
|
|
|
|
|
|
word.clear(); |
|
9389
|
|
|
|
|
|
|
bool first = true; |
|
9390
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(original)) { |
|
9391
|
0
|
0
|
|
|
|
|
utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr)); |
|
|
|
0
|
|
|
|
|
|
|
9392
|
|
|
|
|
|
|
first = false; |
|
9393
|
|
|
|
|
|
|
} |
|
9394
|
0
|
|
|
|
|
|
} |
|
9395
|
|
|
|
|
|
|
|
|
9396
|
0
|
|
|
|
|
|
bool morpho_statistical_guesser_trainer::suffix(const string& word, unsigned& length) { |
|
9397
|
|
|
|
|
|
|
using namespace unilib; |
|
9398
|
|
|
|
|
|
|
|
|
9399
|
|
|
|
|
|
|
unsigned additional = 1; |
|
9400
|
0
|
0
|
|
|
|
|
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9401
|
0
|
|
|
|
|
|
additional++; |
|
9402
|
|
|
|
|
|
|
|
|
9403
|
0
|
0
|
|
|
|
|
if (additional + length > word.size()) return false; |
|
9404
|
|
|
|
|
|
|
|
|
9405
|
0
|
|
|
|
|
|
length += additional; |
|
9406
|
0
|
|
|
|
|
|
return true; |
|
9407
|
|
|
|
|
|
|
} |
|
9408
|
|
|
|
|
|
|
|
|
9409
|
|
|
|
|
|
|
} // namespace morphodita |
|
9410
|
|
|
|
|
|
|
|
|
9411
|
|
|
|
|
|
|
///////// |
|
9412
|
|
|
|
|
|
|
// File: morphodita/morpho/raw_morpho_dictionary_reader.cpp |
|
9413
|
|
|
|
|
|
|
///////// |
|
9414
|
|
|
|
|
|
|
|
|
9415
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
|
9416
|
|
|
|
|
|
|
// |
|
9417
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9418
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9419
|
|
|
|
|
|
|
// |
|
9420
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9421
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9422
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9423
|
|
|
|
|
|
|
|
|
9424
|
|
|
|
|
|
|
namespace morphodita { |
|
9425
|
|
|
|
|
|
|
|
|
9426
|
0
|
|
|
|
|
|
bool raw_morpho_dictionary_reader::next_lemma(string& lemma, vector>& tagged_forms) { |
|
9427
|
0
|
0
|
|
|
|
|
if (line.empty()) { |
|
9428
|
0
|
0
|
|
|
|
|
if (!getline(in, line)) |
|
9429
|
|
|
|
|
|
|
return false; |
|
9430
|
0
|
|
|
|
|
|
split(line, '\t', tokens); |
|
9431
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9432
|
|
|
|
|
|
|
} |
|
9433
|
|
|
|
|
|
|
|
|
9434
|
|
|
|
|
|
|
lemma = tokens[0]; |
|
9435
|
0
|
0
|
|
|
|
|
if (seen_lemmas.count(lemma)) |
|
9436
|
0
|
0
|
|
|
|
|
training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!"); |
|
|
|
0
|
|
|
|
|
|
|
9437
|
|
|
|
|
|
|
seen_lemmas.insert(lemma); |
|
9438
|
|
|
|
|
|
|
|
|
9439
|
|
|
|
|
|
|
tagged_forms.clear(); |
|
9440
|
0
|
|
|
|
|
|
tagged_forms.emplace_back(tokens[2], tokens[1]); |
|
9441
|
0
|
0
|
|
|
|
|
while (getline(in, line)) { |
|
9442
|
0
|
|
|
|
|
|
split(line, '\t', tokens); |
|
9443
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9444
|
|
|
|
|
|
|
|
|
9445
|
0
|
0
|
|
|
|
|
if (lemma != tokens[0]) break; |
|
9446
|
0
|
|
|
|
|
|
tagged_forms.emplace_back(tokens[2], tokens[1]); |
|
9447
|
|
|
|
|
|
|
} |
|
9448
|
|
|
|
|
|
|
|
|
9449
|
|
|
|
|
|
|
return true; |
|
9450
|
|
|
|
|
|
|
} |
|
9451
|
|
|
|
|
|
|
|
|
9452
|
|
|
|
|
|
|
} // namespace morphodita |
|
9453
|
|
|
|
|
|
|
|
|
9454
|
|
|
|
|
|
|
///////// |
|
9455
|
|
|
|
|
|
|
// File: morphodita/morpho/tag_filter.cpp |
|
9456
|
|
|
|
|
|
|
///////// |
|
9457
|
|
|
|
|
|
|
|
|
9458
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
|
9459
|
|
|
|
|
|
|
// |
|
9460
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9461
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9462
|
|
|
|
|
|
|
// |
|
9463
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9464
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9465
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9466
|
|
|
|
|
|
|
|
|
9467
|
|
|
|
|
|
|
namespace morphodita { |
|
9468
|
|
|
|
|
|
|
|
|
9469
|
0
|
|
|
|
|
|
tag_filter::tag_filter(const char* filter) { |
|
9470
|
0
|
0
|
|
|
|
|
if (!filter) return; |
|
9471
|
|
|
|
|
|
|
|
|
9472
|
0
|
0
|
|
|
|
|
wildcard.assign(filter); |
|
9473
|
|
|
|
|
|
|
filter = wildcard.c_str(); |
|
9474
|
|
|
|
|
|
|
|
|
9475
|
0
|
0
|
|
|
|
|
for (int tag_pos = 0, filter_pos = 0; filter[filter_pos]; tag_pos++, filter_pos++) { |
|
9476
|
0
|
0
|
|
|
|
|
if (filter[filter_pos] == '?') continue; |
|
9477
|
0
|
0
|
|
|
|
|
if (filter[filter_pos] == '[') { |
|
9478
|
0
|
|
|
|
|
|
filter_pos++; |
|
9479
|
|
|
|
|
|
|
|
|
9480
|
0
|
|
|
|
|
|
bool negate = false; |
|
9481
|
0
|
0
|
|
|
|
|
if (filter[filter_pos] == '^') negate = true, filter_pos++; |
|
9482
|
|
|
|
|
|
|
|
|
9483
|
0
|
|
|
|
|
|
int chars_start = filter_pos; |
|
9484
|
0
|
0
|
|
|
|
|
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9485
|
0
|
|
|
|
|
|
filter_pos++; |
|
9486
|
|
|
|
|
|
|
|
|
9487
|
0
|
0
|
|
|
|
|
filters.emplace_back(tag_pos, negate, chars_start, filter_pos - chars_start); |
|
9488
|
0
|
0
|
|
|
|
|
if (!filter[filter_pos]) break; |
|
9489
|
|
|
|
|
|
|
} else { |
|
9490
|
0
|
0
|
|
|
|
|
filters.emplace_back(tag_pos, false, filter_pos, 1); |
|
9491
|
|
|
|
|
|
|
} |
|
9492
|
|
|
|
|
|
|
} |
|
9493
|
|
|
|
|
|
|
} |
|
9494
|
|
|
|
|
|
|
|
|
9495
|
|
|
|
|
|
|
} // namespace morphodita |
|
9496
|
|
|
|
|
|
|
|
|
9497
|
|
|
|
|
|
|
///////// |
|
9498
|
|
|
|
|
|
|
// File: morphodita/tagger/elementary_features.h |
|
9499
|
|
|
|
|
|
|
///////// |
|
9500
|
|
|
|
|
|
|
|
|
9501
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
9502
|
|
|
|
|
|
|
// |
|
9503
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9504
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9505
|
|
|
|
|
|
|
// |
|
9506
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9507
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9508
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9509
|
|
|
|
|
|
|
|
|
9510
|
|
|
|
|
|
|
namespace morphodita { |
|
9511
|
|
|
|
|
|
|
|
|
9512
|
|
|
|
|
|
|
// Declarations |
|
9513
|
|
|
|
|
|
|
enum elementary_feature_type { PER_FORM, PER_TAG, DYNAMIC }; |
|
9514
|
|
|
|
|
|
|
enum elementary_feature_range { ONLY_CURRENT, ANY_OFFSET }; |
|
9515
|
|
|
|
|
|
|
|
|
9516
|
|
|
|
|
|
|
typedef uint32_t elementary_feature_value; |
|
9517
|
|
|
|
|
|
|
enum :elementary_feature_value { elementary_feature_unknown = 0, elementary_feature_empty = 1 }; |
|
9518
|
|
|
|
|
|
|
|
|
9519
|
136
|
|
|
|
|
|
struct elementary_feature_description { |
|
9520
|
|
|
|
|
|
|
string name; |
|
9521
|
|
|
|
|
|
|
elementary_feature_type type; |
|
9522
|
|
|
|
|
|
|
elementary_feature_range range; |
|
9523
|
|
|
|
|
|
|
int index; |
|
9524
|
|
|
|
|
|
|
int map_index; |
|
9525
|
|
|
|
|
|
|
}; |
|
9526
|
|
|
|
|
|
|
|
|
9527
|
|
|
|
|
|
|
template |
|
9528
|
1
|
|
|
|
|
|
class elementary_features { |
|
9529
|
|
|
|
|
|
|
public: |
|
9530
|
|
|
|
|
|
|
bool load(istream& is); |
|
9531
|
|
|
|
|
|
|
bool save(ostream& out); |
|
9532
|
|
|
|
|
|
|
|
|
9533
|
|
|
|
|
|
|
vector |
|
9534
|
|
|
|
|
|
|
}; |
|
9535
|
|
|
|
|
|
|
|
|
9536
|
0
|
|
|
|
|
|
class persistent_elementary_feature_map : public persistent_unordered_map { |
|
9537
|
|
|
|
|
|
|
public: |
|
9538
|
|
|
|
|
|
|
persistent_elementary_feature_map() : persistent_unordered_map() {} |
|
9539
|
|
|
|
|
|
|
persistent_elementary_feature_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {} |
|
9540
|
|
|
|
|
|
|
|
|
9541
|
|
|
|
|
|
|
elementary_feature_value value(const char* feature, int len) const { |
|
9542
|
92
|
|
|
|
|
|
auto* it = at_typed(feature, len); |
|
9543
|
92
|
0
|
|
|
|
|
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
9544
|
|
|
|
|
|
|
} |
|
9545
|
|
|
|
|
|
|
}; |
|
9546
|
|
|
|
|
|
|
|
|
9547
|
|
|
|
|
|
|
// Definitions |
|
9548
|
|
|
|
|
|
|
template |
|
9549
|
1
|
|
|
|
|
|
inline bool elementary_features |
|
9550
|
|
|
|
|
|
|
binary_decoder data; |
|
9551
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
|
50
|
|
|
|
|
|
|
9552
|
|
|
|
|
|
|
|
|
9553
|
|
|
|
|
|
|
try { |
|
9554
|
1
|
50
|
|
|
|
|
maps.resize(data.next_1B()); |
|
|
|
50
|
|
|
|
|
|
|
9555
|
28
|
100
|
|
|
|
|
for (auto&& map : maps) |
|
9556
|
27
|
50
|
|
|
|
|
map.load(data); |
|
|
|
0
|
|
|
|
|
|
|
9557
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
|
9558
|
|
|
|
|
|
|
return false; |
|
9559
|
|
|
|
|
|
|
} |
|
9560
|
|
|
|
|
|
|
|
|
9561
|
1
|
|
|
|
|
|
return data.is_end(); |
|
9562
|
|
|
|
|
|
|
} |
|
9563
|
|
|
|
|
|
|
|
|
9564
|
|
|
|
|
|
|
} // namespace morphodita |
|
9565
|
|
|
|
|
|
|
|
|
9566
|
|
|
|
|
|
|
///////// |
|
9567
|
|
|
|
|
|
|
// File: morphodita/tagger/vli.h |
|
9568
|
|
|
|
|
|
|
///////// |
|
9569
|
|
|
|
|
|
|
|
|
9570
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
9571
|
|
|
|
|
|
|
// |
|
9572
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9573
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9574
|
|
|
|
|
|
|
// |
|
9575
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9576
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9577
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9578
|
|
|
|
|
|
|
|
|
9579
|
|
|
|
|
|
|
namespace morphodita { |
|
9580
|
|
|
|
|
|
|
|
|
9581
|
|
|
|
|
|
|
// Declarations |
|
9582
|
|
|
|
|
|
|
template |
|
9583
|
|
|
|
|
|
|
class vli { |
|
9584
|
|
|
|
|
|
|
public: |
|
9585
|
|
|
|
|
|
|
static int max_length(); |
|
9586
|
|
|
|
|
|
|
static void encode(T value, char*& where); |
|
9587
|
|
|
|
|
|
|
static T decode(const char*& from); |
|
9588
|
|
|
|
|
|
|
}; |
|
9589
|
|
|
|
|
|
|
|
|
9590
|
|
|
|
|
|
|
// Definitions |
|
9591
|
|
|
|
|
|
|
template <> |
|
9592
|
|
|
|
|
|
|
inline int vli::max_length() { |
|
9593
|
|
|
|
|
|
|
return 5; |
|
9594
|
|
|
|
|
|
|
} |
|
9595
|
|
|
|
|
|
|
|
|
9596
|
|
|
|
|
|
|
template <> |
|
9597
|
1171
|
|
|
|
|
|
inline void vli::encode(uint32_t value, char*& where) { |
|
9598
|
1171
|
50
|
|
|
|
|
if (value < 0x80) *where++ = value; |
|
9599
|
0
|
0
|
|
|
|
|
else if (value < 0x4000) *where++ = (value >> 7) | 0x80u, *where++ = value & 0x7Fu; |
|
9600
|
0
|
0
|
|
|
|
|
else if (value < 0x200000) *where++ = (value >> 14) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
|
9601
|
0
|
0
|
|
|
|
|
else if (value < 0x10000000) *where++ = (value >> 21) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
|
9602
|
0
|
|
|
|
|
|
else *where++ = (value >> 28) | 0x80u, *where++ = ((value >> 21) & 0x7Fu) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
|
9603
|
1171
|
|
|
|
|
|
} |
|
9604
|
|
|
|
|
|
|
|
|
9605
|
|
|
|
|
|
|
template <> |
|
9606
|
|
|
|
|
|
|
inline uint32_t vli::decode(const char*& from) { |
|
9607
|
|
|
|
|
|
|
uint32_t value = 0; |
|
9608
|
0
|
0
|
|
|
|
|
while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u); |
|
|
|
0
|
|
|
|
|
|
|
9609
|
0
|
|
|
|
|
|
value = (value << 7) | ((unsigned char)(*from++)); |
|
9610
|
|
|
|
|
|
|
return value; |
|
9611
|
|
|
|
|
|
|
} |
|
9612
|
|
|
|
|
|
|
|
|
9613
|
|
|
|
|
|
|
} // namespace morphodita |
|
9614
|
|
|
|
|
|
|
|
|
9615
|
|
|
|
|
|
|
///////// |
|
9616
|
|
|
|
|
|
|
// File: morphodita/tagger/feature_sequences.h |
|
9617
|
|
|
|
|
|
|
///////// |
|
9618
|
|
|
|
|
|
|
|
|
9619
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
9620
|
|
|
|
|
|
|
// |
|
9621
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9622
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9623
|
|
|
|
|
|
|
// |
|
9624
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9625
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9626
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9627
|
|
|
|
|
|
|
|
|
9628
|
|
|
|
|
|
|
namespace morphodita { |
|
9629
|
|
|
|
|
|
|
|
|
9630
|
|
|
|
|
|
|
// Declarations |
|
9631
|
|
|
|
|
|
|
typedef int32_t feature_sequence_score; |
|
9632
|
|
|
|
|
|
|
typedef int64_t feature_sequences_score; |
|
9633
|
|
|
|
|
|
|
|
|
9634
|
|
|
|
|
|
|
struct feature_sequence_element { |
|
9635
|
|
|
|
|
|
|
elementary_feature_type type; |
|
9636
|
|
|
|
|
|
|
int elementary_index; |
|
9637
|
|
|
|
|
|
|
int sequence_index; |
|
9638
|
|
|
|
|
|
|
|
|
9639
|
|
|
|
|
|
|
feature_sequence_element() {} |
|
9640
|
0
|
|
|
|
|
|
feature_sequence_element(elementary_feature_type type, int elementary_index, int sequence_index) : type(type), elementary_index(elementary_index), sequence_index(sequence_index) {} |
|
9641
|
|
|
|
|
|
|
}; |
|
9642
|
|
|
|
|
|
|
|
|
9643
|
74
|
0
|
|
|
|
|
struct feature_sequence { |
|
|
|
0
|
|
|
|
|
|
|
9644
|
|
|
|
|
|
|
vector elements; |
|
9645
|
|
|
|
|
|
|
int dependant_range = 1; |
|
9646
|
|
|
|
|
|
|
}; |
|
9647
|
|
|
|
|
|
|
|
|
9648
|
|
|
|
|
|
|
template |
|
9649
|
3
|
0
|
|
|
|
|
class feature_sequences { |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9650
|
|
|
|
|
|
|
public: |
|
9651
|
|
|
|
|
|
|
typedef typename ElementaryFeatures::per_form_features per_form_features; |
|
9652
|
|
|
|
|
|
|
typedef typename ElementaryFeatures::per_tag_features per_tag_features; |
|
9653
|
|
|
|
|
|
|
typedef typename ElementaryFeatures::dynamic_features dynamic_features; |
|
9654
|
|
|
|
|
|
|
|
|
9655
|
|
|
|
|
|
|
void parse(int window_size, istream& is); |
|
9656
|
|
|
|
|
|
|
bool load(istream& is); |
|
9657
|
|
|
|
|
|
|
bool save(ostream& os); |
|
9658
|
|
|
|
|
|
|
|
|
9659
|
|
|
|
|
|
|
struct cache; |
|
9660
|
|
|
|
|
|
|
|
|
9661
|
|
|
|
|
|
|
inline void initialize_sentence(const vector& forms, const vector>& analyses, cache& c) const; |
|
9662
|
|
|
|
|
|
|
inline void compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const; |
|
9663
|
|
|
|
|
|
|
inline feature_sequences_score score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const; |
|
9664
|
|
|
|
|
|
|
void feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector& keys, cache& c) const; |
|
9665
|
|
|
|
|
|
|
|
|
9666
|
|
|
|
|
|
|
ElementaryFeatures elementary; |
|
9667
|
|
|
|
|
|
|
vector |
|
9668
|
|
|
|
|
|
|
vector sequences; |
|
9669
|
|
|
|
|
|
|
}; |
|
9670
|
|
|
|
|
|
|
|
|
9671
|
0
|
|
|
|
|
|
class persistent_feature_sequence_map : public persistent_unordered_map { |
|
9672
|
|
|
|
|
|
|
public: |
|
9673
|
|
|
|
|
|
|
persistent_feature_sequence_map() : persistent_unordered_map() {} |
|
9674
|
|
|
|
|
|
|
persistent_feature_sequence_map(const persistent_unordered_map&& map) : persistent_unordered_map(map) {} |
|
9675
|
|
|
|
|
|
|
|
|
9676
|
|
|
|
|
|
|
feature_sequence_score score(const char* feature, int len) const { |
|
9677
|
346
|
|
|
|
|
|
auto* it = at_typed(feature, len); |
|
9678
|
346
|
0
|
|
|
|
|
return it ? unaligned_load(it) : 0; |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
9679
|
|
|
|
|
|
|
} |
|
9680
|
|
|
|
|
|
|
}; |
|
9681
|
|
|
|
|
|
|
|
|
9682
|
|
|
|
|
|
|
template using persistent_feature_sequences = feature_sequences; |
|
9683
|
|
|
|
|
|
|
|
|
9684
|
|
|
|
|
|
|
// Definitions |
|
9685
|
|
|
|
|
|
|
template |
|
9686
|
1
|
|
|
|
|
|
inline bool feature_sequences::load(istream& is) { |
|
9687
|
1
|
50
|
|
|
|
|
if (!elementary.load(is)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9688
|
|
|
|
|
|
|
|
|
9689
|
|
|
|
|
|
|
binary_decoder data; |
|
9690
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9691
|
|
|
|
|
|
|
|
|
9692
|
|
|
|
|
|
|
try { |
|
9693
|
1
|
50
|
|
|
|
|
sequences.resize(data.next_1B()); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9694
|
75
|
100
|
|
|
|
|
for (auto&& sequence : sequences) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9695
|
74
|
50
|
|
|
|
|
sequence.dependant_range = data.next_4B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9696
|
74
|
50
|
|
|
|
|
sequence.elements.resize(data.next_1B()); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9697
|
228
|
100
|
|
|
|
|
for (auto&& element : sequence.elements) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9698
|
154
|
50
|
|
|
|
|
element.type = elementary_feature_type(data.next_4B()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9699
|
154
|
50
|
|
|
|
|
element.elementary_index = data.next_4B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9700
|
154
|
50
|
|
|
|
|
element.sequence_index = data.next_4B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9701
|
|
|
|
|
|
|
} |
|
9702
|
|
|
|
|
|
|
} |
|
9703
|
|
|
|
|
|
|
|
|
9704
|
1
|
50
|
|
|
|
|
scores.resize(data.next_1B()); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9705
|
75
|
100
|
|
|
|
|
for (auto&& score : scores) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9706
|
74
|
50
|
|
|
|
|
score.load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9707
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
|
9708
|
|
|
|
|
|
|
return false; |
|
9709
|
|
|
|
|
|
|
} |
|
9710
|
|
|
|
|
|
|
|
|
9711
|
1
|
|
|
|
|
|
return data.is_end(); |
|
9712
|
|
|
|
|
|
|
} |
|
9713
|
|
|
|
|
|
|
|
|
9714
|
|
|
|
|
|
|
template |
|
9715
|
2
|
|
|
|
|
|
struct feature_sequences::cache { |
|
9716
|
|
|
|
|
|
|
const vector* forms; |
|
9717
|
|
|
|
|
|
|
const vector>* analyses; |
|
9718
|
|
|
|
|
|
|
vector elementary_per_form; |
|
9719
|
|
|
|
|
|
|
vector> elementary_per_tag; |
|
9720
|
|
|
|
|
|
|
|
|
9721
|
0
|
|
|
|
|
|
struct cache_element { |
|
9722
|
|
|
|
|
|
|
vector key; |
|
9723
|
|
|
|
|
|
|
int key_size; |
|
9724
|
|
|
|
|
|
|
feature_sequence_score score; |
|
9725
|
|
|
|
|
|
|
|
|
9726
|
74
|
0
|
|
|
|
|
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9727
|
|
|
|
|
|
|
}; |
|
9728
|
|
|
|
|
|
|
vector caches; |
|
9729
|
|
|
|
|
|
|
vector window; |
|
9730
|
|
|
|
|
|
|
vector key; |
|
9731
|
|
|
|
|
|
|
feature_sequences_score score; |
|
9732
|
|
|
|
|
|
|
|
|
9733
|
1
|
|
|
|
|
|
cache(const feature_sequences& self) : score(0) { |
|
9734
|
1
|
0
|
|
|
|
|
caches.reserve(self.sequences.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9735
|
|
|
|
|
|
|
int max_sequence_elements = 0, max_window_size = 1; |
|
9736
|
75
|
0
|
|
|
|
|
for (auto&& sequence : self.sequences) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9737
|
74
|
0
|
|
|
|
|
caches.emplace_back(int(sequence.elements.size())); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9738
|
74
|
0
|
|
|
|
|
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9739
|
228
|
0
|
|
|
|
|
for (auto&& element : sequence.elements) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9740
|
154
|
0
|
|
|
|
|
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9741
|
|
|
|
|
|
|
max_window_size = 1 - element.sequence_index; |
|
9742
|
|
|
|
|
|
|
} |
|
9743
|
1
|
0
|
|
|
|
|
key.resize(max_sequence_elements * vli::max_length()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9744
|
1
|
0
|
|
|
|
|
window.resize(max_window_size); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9745
|
1
|
|
|
|
|
|
} |
|
9746
|
|
|
|
|
|
|
}; |
|
9747
|
|
|
|
|
|
|
|
|
9748
|
|
|
|
|
|
|
template |
|
9749
|
1
|
|
|
|
|
|
void feature_sequences::initialize_sentence(const vector& forms, const vector>& analyses, cache& c) const { |
|
9750
|
|
|
|
|
|
|
// Store forms and forms_size |
|
9751
|
1
|
|
|
|
|
|
c.forms = &forms; |
|
9752
|
1
|
|
|
|
|
|
c.analyses = &analyses; |
|
9753
|
|
|
|
|
|
|
|
|
9754
|
|
|
|
|
|
|
// Enlarge elementary features vectors if needed |
|
9755
|
1
|
0
|
|
|
|
|
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9756
|
1
|
0
|
|
|
|
|
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9757
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9758
|
7
|
0
|
|
|
|
|
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9759
|
7
|
|
|
|
|
|
c.elementary_per_tag[i].resize(analyses[i].size() * 2); |
|
9760
|
|
|
|
|
|
|
|
|
9761
|
|
|
|
|
|
|
// Compute elementary features |
|
9762
|
1
|
|
|
|
|
|
elementary.compute_features(forms, analyses, c.elementary_per_form, c.elementary_per_tag); |
|
9763
|
|
|
|
|
|
|
|
|
9764
|
|
|
|
|
|
|
// Clear score cache, because scores may have been modified |
|
9765
|
1
|
|
|
|
|
|
c.score = 0; |
|
9766
|
75
|
0
|
|
|
|
|
for (auto&& cache : c.caches) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9767
|
74
|
|
|
|
|
|
cache.key_size = cache.score = 0; |
|
9768
|
1
|
|
|
|
|
|
} |
|
9769
|
|
|
|
|
|
|
|
|
9770
|
|
|
|
|
|
|
template |
|
9771
|
30
|
|
|
|
|
|
void feature_sequences::compute_dynamic_features(int form_index, int tag_index, const dynamic_features* prev_dynamic, dynamic_features& dynamic, cache& c) const { |
|
9772
|
15
|
0
|
|
|
|
|
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9773
|
15
|
|
|
|
|
|
} |
|
9774
|
|
|
|
|
|
|
|
|
9775
|
|
|
|
|
|
|
template |
|
9776
|
26
|
|
|
|
|
|
feature_sequences_score feature_sequences::score(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, cache& c) const { |
|
9777
|
|
|
|
|
|
|
// Start by creating a window of per_tag_features* |
|
9778
|
43
|
0
|
|
|
|
|
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9779
|
90
|
|
|
|
|
|
c.window[i] = &c.elementary_per_tag[form_index - i][tags_window[i]]; |
|
9780
|
|
|
|
|
|
|
|
|
9781
|
|
|
|
|
|
|
// Compute the score |
|
9782
|
13
|
|
|
|
|
|
feature_sequences_score result = c.score; |
|
9783
|
671
|
0
|
|
|
|
|
for (unsigned i = 0; i < sequences.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9784
|
658
|
0
|
|
|
|
|
if (tags_unchanged >= sequences[i].dependant_range) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9785
|
|
|
|
|
|
|
break; |
|
9786
|
|
|
|
|
|
|
|
|
9787
|
653
|
|
|
|
|
|
char* key = c.key.data(); |
|
9788
|
1824
|
0
|
|
|
|
|
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9789
|
|
|
|
|
|
|
auto& element = sequences[i].elements[j]; |
|
9790
|
|
|
|
|
|
|
elementary_feature_value value; |
|
9791
|
|
|
|
|
|
|
|
|
9792
|
1345
|
|
|
|
|
|
switch (element.type) { |
|
9793
|
|
|
|
|
|
|
case PER_FORM: |
|
9794
|
475
|
0
|
|
|
|
|
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9795
|
|
|
|
|
|
|
break; |
|
9796
|
|
|
|
|
|
|
case PER_TAG: |
|
9797
|
844
|
0
|
|
|
|
|
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9798
|
|
|
|
|
|
|
break; |
|
9799
|
|
|
|
|
|
|
case DYNAMIC: |
|
9800
|
|
|
|
|
|
|
default: |
|
9801
|
26
|
|
|
|
|
|
value = dynamic.values[element.elementary_index]; |
|
9802
|
|
|
|
|
|
|
} |
|
9803
|
|
|
|
|
|
|
|
|
9804
|
1345
|
0
|
|
|
|
|
if (value == elementary_feature_unknown) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9805
|
174
|
|
|
|
|
|
key = c.key.data(); |
|
9806
|
174
|
|
|
|
|
|
break; |
|
9807
|
|
|
|
|
|
|
} |
|
9808
|
1171
|
|
|
|
|
|
vli::encode(value, key); |
|
9809
|
|
|
|
|
|
|
} |
|
9810
|
|
|
|
|
|
|
|
|
9811
|
653
|
|
|
|
|
|
result -= c.caches[i].score; |
|
9812
|
653
|
|
|
|
|
|
int key_size = key - c.key.data(); |
|
9813
|
653
|
0
|
|
|
|
|
if (!key_size) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9814
|
174
|
|
|
|
|
|
c.caches[i].score = 0; |
|
9815
|
174
|
|
|
|
|
|
c.caches[i].key_size = 0; |
|
9816
|
834
|
0
|
|
|
|
|
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9817
|
0
|
|
|
|
|
|
c.caches[i].score = scores[i].score(c.key.data(), key_size); |
|
9818
|
346
|
|
|
|
|
|
c.caches[i].key_size = key_size; |
|
9819
|
346
|
|
|
|
|
|
small_memcpy(c.caches[i].key.data(), c.key.data(), key_size); |
|
9820
|
|
|
|
|
|
|
} |
|
9821
|
653
|
|
|
|
|
|
result += c.caches[i].score; |
|
9822
|
|
|
|
|
|
|
} |
|
9823
|
|
|
|
|
|
|
|
|
9824
|
13
|
|
|
|
|
|
c.score = result; |
|
9825
|
13
|
|
|
|
|
|
return result; |
|
9826
|
|
|
|
|
|
|
} |
|
9827
|
|
|
|
|
|
|
|
|
9828
|
|
|
|
|
|
|
template |
|
9829
|
0
|
|
|
|
|
|
void feature_sequences::feature_keys(int form_index, int tags_window[], int tags_unchanged, dynamic_features& dynamic, vector& keys, cache& c) const { |
|
9830
|
0
|
|
|
|
|
|
score(form_index, tags_window, tags_unchanged, dynamic, c); |
|
9831
|
|
|
|
|
|
|
|
|
9832
|
0
|
|
|
|
|
|
keys.resize(c.caches.size()); |
|
9833
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < c.caches.size(); i++) |
|
9834
|
0
|
|
|
|
|
|
keys[i].assign(c.caches[i].key.data(), c.caches[i].key_size); |
|
9835
|
0
|
|
|
|
|
|
} |
|
9836
|
|
|
|
|
|
|
|
|
9837
|
|
|
|
|
|
|
} // namespace morphodita |
|
9838
|
|
|
|
|
|
|
|
|
9839
|
|
|
|
|
|
|
///////// |
|
9840
|
|
|
|
|
|
|
// File: morphodita/tagger/viterbi.h |
|
9841
|
|
|
|
|
|
|
///////// |
|
9842
|
|
|
|
|
|
|
|
|
9843
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
9844
|
|
|
|
|
|
|
// |
|
9845
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9846
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9847
|
|
|
|
|
|
|
// |
|
9848
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9849
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9850
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9851
|
|
|
|
|
|
|
|
|
9852
|
|
|
|
|
|
|
namespace morphodita { |
|
9853
|
|
|
|
|
|
|
|
|
9854
|
|
|
|
|
|
|
// Declarations |
|
9855
|
|
|
|
|
|
|
template |
|
9856
|
|
|
|
|
|
|
class viterbi { |
|
9857
|
|
|
|
|
|
|
public: |
|
9858
|
|
|
|
|
|
|
viterbi(const FeatureSequences& features, int decoding_order, int window_size) |
|
9859
|
1
|
|
|
|
|
|
: features(features), decoding_order(decoding_order), window_size(window_size) {} |
|
9860
|
|
|
|
|
|
|
|
|
9861
|
|
|
|
|
|
|
struct cache; |
|
9862
|
|
|
|
|
|
|
void tag(const vector& forms, const vector>& analyses, cache& c, vector& tags) const; |
|
9863
|
|
|
|
|
|
|
|
|
9864
|
|
|
|
|
|
|
private: |
|
9865
|
|
|
|
|
|
|
struct node; |
|
9866
|
|
|
|
|
|
|
|
|
9867
|
|
|
|
|
|
|
const FeatureSequences& features; |
|
9868
|
|
|
|
|
|
|
int decoding_order, window_size; |
|
9869
|
|
|
|
|
|
|
}; |
|
9870
|
|
|
|
|
|
|
|
|
9871
|
|
|
|
|
|
|
// Definitions |
|
9872
|
|
|
|
|
|
|
template |
|
9873
|
2
|
|
|
|
|
|
struct viterbi::cache { |
|
9874
|
|
|
|
|
|
|
vector nodes; |
|
9875
|
|
|
|
|
|
|
typename FeatureSequences::cache features_cache; |
|
9876
|
|
|
|
|
|
|
|
|
9877
|
1
|
0
|
|
|
|
|
cache(const viterbi& self) : features_cache(self.features) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9878
|
|
|
|
|
|
|
}; |
|
9879
|
|
|
|
|
|
|
|
|
9880
|
|
|
|
|
|
|
template |
|
9881
|
|
|
|
|
|
|
struct viterbi::node { |
|
9882
|
|
|
|
|
|
|
int tag; |
|
9883
|
|
|
|
|
|
|
int prev; |
|
9884
|
|
|
|
|
|
|
feature_sequences_score score; |
|
9885
|
|
|
|
|
|
|
typename FeatureSequences::dynamic_features dynamic; |
|
9886
|
|
|
|
|
|
|
}; |
|
9887
|
|
|
|
|
|
|
|
|
9888
|
|
|
|
|
|
|
template |
|
9889
|
1
|
|
|
|
|
|
void viterbi::tag(const vector& forms, const vector>& analyses, cache& c, vector& tags) const { |
|
9890
|
2
|
0
|
|
|
|
|
if (!forms.size()) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9891
|
|
|
|
|
|
|
|
|
9892
|
|
|
|
|
|
|
// Count number of nodes and allocate |
|
9893
|
|
|
|
|
|
|
unsigned nodes = 0; |
|
9894
|
8
|
0
|
|
|
|
|
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9895
|
7
|
0
|
|
|
|
|
if (analyses[i].empty()) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9896
|
7
|
0
|
|
|
|
|
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9897
|
7
|
|
|
|
|
|
nodes += states; |
|
9898
|
|
|
|
|
|
|
} |
|
9899
|
1
|
0
|
|
|
|
|
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9900
|
|
|
|
|
|
|
|
|
9901
|
|
|
|
|
|
|
// Init feature sequences |
|
9902
|
1
|
|
|
|
|
|
features.initialize_sentence(forms, analyses, c.features_cache); |
|
9903
|
|
|
|
|
|
|
|
|
9904
|
|
|
|
|
|
|
int window_stack[16]; vector window_heap; |
|
9905
|
1
|
0
|
|
|
|
|
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9906
|
|
|
|
|
|
|
typename FeatureSequences::dynamic_features dynamic; |
|
9907
|
|
|
|
|
|
|
feature_sequences_score score; |
|
9908
|
|
|
|
|
|
|
|
|
9909
|
|
|
|
|
|
|
// Compute all nodes score |
|
9910
|
|
|
|
|
|
|
int nodes_prev = -1, nodes_now = 0; |
|
9911
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9912
|
|
|
|
|
|
|
int nodes_next = nodes_now; |
|
9913
|
|
|
|
|
|
|
|
|
9914
|
28
|
0
|
|
|
|
|
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9915
|
17
|
0
|
|
|
|
|
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9916
|
25
|
0
|
|
|
|
|
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9917
|
|
|
|
|
|
|
// Compute predecessors and number of unchanges |
|
9918
|
15
|
|
|
|
|
|
int same_tags = window[0] == tag; |
|
9919
|
15
|
|
|
|
|
|
window[0] = tag; |
|
9920
|
36
|
0
|
|
|
|
|
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9921
|
21
|
0
|
|
|
|
|
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9922
|
42
|
|
|
|
|
|
window[n] = c.nodes[p].tag; |
|
9923
|
|
|
|
|
|
|
} |
|
9924
|
|
|
|
|
|
|
|
|
9925
|
|
|
|
|
|
|
// Compute dynamic elementary features and score |
|
9926
|
15
|
0
|
|
|
|
|
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9927
|
15
|
0
|
|
|
|
|
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9928
|
12
|
|
|
|
|
|
(prev >= 0 ? c.nodes[prev].score : 0); |
|
9929
|
|
|
|
|
|
|
|
|
9930
|
|
|
|
|
|
|
// Update existing node or create a new one |
|
9931
|
15
|
0
|
|
|
|
|
if (same_tags >= decoding_order-1) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9932
|
2
|
0
|
|
|
|
|
if (score <= c.nodes[nodes_next-1].score) continue; |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9933
|
|
|
|
|
|
|
nodes_next--; |
|
9934
|
|
|
|
|
|
|
} |
|
9935
|
28
|
|
|
|
|
|
c.nodes[nodes_next].tag = tag; |
|
9936
|
14
|
|
|
|
|
|
c.nodes[nodes_next].prev = prev; |
|
9937
|
14
|
|
|
|
|
|
c.nodes[nodes_next].score = score; |
|
9938
|
14
|
|
|
|
|
|
c.nodes[nodes_next++].dynamic = dynamic; |
|
9939
|
|
|
|
|
|
|
} |
|
9940
|
|
|
|
|
|
|
|
|
9941
|
|
|
|
|
|
|
nodes_prev = nodes_now; |
|
9942
|
|
|
|
|
|
|
nodes_now = nodes_next; |
|
9943
|
|
|
|
|
|
|
} |
|
9944
|
|
|
|
|
|
|
|
|
9945
|
|
|
|
|
|
|
// Choose the best ending node |
|
9946
|
|
|
|
|
|
|
int best = nodes_prev; |
|
9947
|
2
|
0
|
|
|
|
|
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9948
|
1
|
0
|
|
|
|
|
if (c.nodes[node].score > c.nodes[best].score) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9949
|
|
|
|
|
|
|
best = node; |
|
9950
|
|
|
|
|
|
|
|
|
9951
|
8
|
0
|
|
|
|
|
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
9952
|
21
|
|
|
|
|
|
tags[i] = c.nodes[best].tag; |
|
9953
|
|
|
|
|
|
|
} |
|
9954
|
|
|
|
|
|
|
|
|
9955
|
|
|
|
|
|
|
} // namespace morphodita |
|
9956
|
|
|
|
|
|
|
|
|
9957
|
|
|
|
|
|
|
///////// |
|
9958
|
|
|
|
|
|
|
// File: morphodita/tagger/conllu_elementary_features.h |
|
9959
|
|
|
|
|
|
|
///////// |
|
9960
|
|
|
|
|
|
|
|
|
9961
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
9962
|
|
|
|
|
|
|
// |
|
9963
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
9964
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
9965
|
|
|
|
|
|
|
// |
|
9966
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
9967
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
9968
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
9969
|
|
|
|
|
|
|
|
|
9970
|
|
|
|
|
|
|
namespace morphodita { |
|
9971
|
|
|
|
|
|
|
|
|
9972
|
|
|
|
|
|
|
// Declarations |
|
9973
|
|
|
|
|
|
|
template |
|
9974
|
1
|
|
|
|
|
|
class conllu_elementary_features : public elementary_features |
|
9975
|
|
|
|
|
|
|
public: |
|
9976
|
|
|
|
|
|
|
conllu_elementary_features(); |
|
9977
|
|
|
|
|
|
|
|
|
9978
|
|
|
|
|
|
|
enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_FORM, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL }; |
|
9979
|
|
|
|
|
|
|
enum features_per_tag { TAG, TAG_UPOS, TAG_CASE, TAG_GENDER, TAG_NUMBER, TAG_NEGATIVE, TAG_PERSON, LEMMA, PER_TAG_TOTAL }; |
|
9980
|
|
|
|
|
|
|
enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_FORM, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_FORM, DYNAMIC_TOTAL }; |
|
9981
|
|
|
|
|
|
|
enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG_UPOS, MAP_TAG_CASE, MAP_TAG_GENDER, MAP_TAG_NUMBER, MAP_TAG_NEGATIVE, MAP_TAG_PERSON, MAP_LEMMA, MAP_TOTAL } ; |
|
9982
|
|
|
|
|
|
|
|
|
9983
|
|
|
|
|
|
|
struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; }; |
|
9984
|
|
|
|
|
|
|
struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; }; |
|
9985
|
|
|
|
|
|
|
struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; }; |
|
9986
|
|
|
|
|
|
|
|
|
9987
|
|
|
|
|
|
|
static vector descriptions; |
|
9988
|
|
|
|
|
|
|
|
|
9989
|
|
|
|
|
|
|
void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const; |
|
9990
|
|
|
|
|
|
|
inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const; |
|
9991
|
|
|
|
|
|
|
|
|
9992
|
|
|
|
|
|
|
using elementary_features |
|
9993
|
|
|
|
|
|
|
}; |
|
9994
|
|
|
|
|
|
|
|
|
9995
|
|
|
|
|
|
|
typedef conllu_elementary_features persistent_conllu_elementary_features; |
|
9996
|
|
|
|
|
|
|
|
|
9997
|
|
|
|
|
|
|
// Definitions |
|
9998
|
|
|
|
|
|
|
template |
|
9999
|
1
|
|
|
|
|
|
conllu_elementary_features |
|
10000
|
1
|
0
|
|
|
|
|
maps.resize(MAP_TOTAL); |
|
|
|
50
|
|
|
|
|
|
|
10001
|
1
|
|
|
|
|
|
} |
|
10002
|
|
|
|
|
|
|
|
|
10003
|
|
|
|
|
|
|
template |
|
10004
|
70
|
50
|
|
|
|
|
vector conllu_elementary_features |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10005
|
|
|
|
|
|
|
{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM}, |
|
10006
|
|
|
|
|
|
|
{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG}, |
|
10007
|
|
|
|
|
|
|
{"FollowingVerbForm", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_FORM, MAP_FORM}, |
|
10008
|
|
|
|
|
|
|
{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE}, |
|
10009
|
|
|
|
|
|
|
{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE}, |
|
10010
|
|
|
|
|
|
|
{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE}, |
|
10011
|
|
|
|
|
|
|
{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1}, |
|
10012
|
|
|
|
|
|
|
{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2}, |
|
10013
|
|
|
|
|
|
|
{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3}, |
|
10014
|
|
|
|
|
|
|
{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4}, |
|
10015
|
|
|
|
|
|
|
{"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5}, |
|
10016
|
|
|
|
|
|
|
{"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6}, |
|
10017
|
|
|
|
|
|
|
{"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7}, |
|
10018
|
|
|
|
|
|
|
{"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8}, |
|
10019
|
|
|
|
|
|
|
{"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9}, |
|
10020
|
|
|
|
|
|
|
{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1}, |
|
10021
|
|
|
|
|
|
|
{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2}, |
|
10022
|
|
|
|
|
|
|
{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3}, |
|
10023
|
|
|
|
|
|
|
{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4}, |
|
10024
|
|
|
|
|
|
|
{"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5}, |
|
10025
|
|
|
|
|
|
|
{"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6}, |
|
10026
|
|
|
|
|
|
|
{"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7}, |
|
10027
|
|
|
|
|
|
|
{"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8}, |
|
10028
|
|
|
|
|
|
|
{"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9}, |
|
10029
|
|
|
|
|
|
|
|
|
10030
|
|
|
|
|
|
|
{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG}, |
|
10031
|
|
|
|
|
|
|
{"TagUPos", PER_TAG, ANY_OFFSET, TAG_UPOS, MAP_TAG_UPOS}, |
|
10032
|
|
|
|
|
|
|
{"TagCase", PER_TAG, ANY_OFFSET, TAG_CASE, MAP_TAG_CASE}, |
|
10033
|
|
|
|
|
|
|
{"TagGender", PER_TAG, ANY_OFFSET, TAG_GENDER, MAP_TAG_GENDER}, |
|
10034
|
|
|
|
|
|
|
{"TagNumber", PER_TAG, ANY_OFFSET, TAG_NUMBER, MAP_TAG_NUMBER}, |
|
10035
|
|
|
|
|
|
|
{"TagNegative", PER_TAG, ANY_OFFSET, TAG_NEGATIVE, MAP_TAG_NEGATIVE}, |
|
10036
|
|
|
|
|
|
|
{"TagPerson", PER_TAG, ANY_OFFSET, TAG_PERSON, MAP_TAG_PERSON}, |
|
10037
|
|
|
|
|
|
|
{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA}, |
|
10038
|
|
|
|
|
|
|
|
|
10039
|
|
|
|
|
|
|
{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG}, |
|
10040
|
|
|
|
|
|
|
{"PreviousVerbForm", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_FORM, MAP_FORM}, |
|
10041
|
|
|
|
|
|
|
}; |
|
10042
|
|
|
|
|
|
|
|
|
10043
|
|
|
|
|
|
|
template |
|
10044
|
1
|
|
|
|
|
|
void conllu_elementary_features |
|
10045
|
|
|
|
|
|
|
using namespace unilib; |
|
10046
|
|
|
|
|
|
|
|
|
10047
|
|
|
|
|
|
|
// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbForm directly. |
|
10048
|
|
|
|
|
|
|
elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_form = elementary_feature_empty; |
|
10049
|
8
|
100
|
|
|
|
|
for (unsigned i = forms.size(); i--;) { |
|
|
|
0
|
|
|
|
|
|
|
10050
|
|
|
|
|
|
|
int verb_candidate = -1; |
|
10051
|
|
|
|
|
|
|
|
|
10052
|
|
|
|
|
|
|
// Per_tag features and verb_candidate |
|
10053
|
17
|
100
|
|
|
|
|
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
|
|
0
|
|
|
|
|
|
|
10054
|
10
|
|
|
|
|
|
const string& tag = analyses[i][j].tag; |
|
10055
|
10
|
|
|
|
|
|
const string& lemma = analyses[i][j].lemma; |
|
10056
|
|
|
|
|
|
|
|
|
10057
|
|
|
|
|
|
|
// Tag consists of three parts separated by tag[0] character |
|
10058
|
|
|
|
|
|
|
// - first is TAG_UPOS, |
|
10059
|
|
|
|
|
|
|
// - second is TAG_LPOS, |
|
10060
|
|
|
|
|
|
|
// - then there is any number of | separated named fields in format Name=Value |
|
10061
|
0
|
|
|
|
|
|
per_tag[i][j].values[TAG] = maps[MAP_TAG].value(tag.c_str(), tag.size()); |
|
10062
|
10
|
|
|
|
|
|
per_tag[i][j].values[TAG_UPOS] = per_tag[i][j].values[TAG_CASE] = per_tag[i][j].values[TAG_GENDER] = elementary_feature_empty; |
|
10063
|
10
|
|
|
|
|
|
per_tag[i][j].values[TAG_NUMBER] = per_tag[i][j].values[TAG_NEGATIVE] = per_tag[i][j].values[TAG_PERSON] = elementary_feature_empty; |
|
10064
|
10
|
100
|
|
|
|
|
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10065
|
|
|
|
|
|
|
maps[MAP_LEMMA].value(lemma.c_str(), lemma.size()); |
|
10066
|
|
|
|
|
|
|
|
|
10067
|
10
|
|
|
|
|
|
char separator = tag[0]; |
|
10068
|
10
|
|
|
|
|
|
size_t index = tag.find(separator, 1); |
|
10069
|
10
|
50
|
|
|
|
|
if (index == string::npos) index = tag.size(); |
|
|
|
0
|
|
|
|
|
|
|
10070
|
10
|
50
|
|
|
|
|
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10071
|
|
|
|
|
|
|
|
|
10072
|
10
|
50
|
|
|
|
|
if (index < tag.size()) index++; |
|
|
|
0
|
|
|
|
|
|
|
10073
|
10
|
50
|
|
|
|
|
if (index < tag.size()) index = tag.find(separator, index); |
|
|
|
0
|
|
|
|
|
|
|
10074
|
10
|
50
|
|
|
|
|
if (index < tag.size()) index++; |
|
|
|
0
|
|
|
|
|
|
|
10075
|
50
|
100
|
|
|
|
|
for (size_t length; index < tag.size(); index += length + 1) { |
|
|
|
0
|
|
|
|
|
|
|
10076
|
40
|
|
|
|
|
|
length = tag.find('|', index); |
|
10077
|
40
|
100
|
|
|
|
|
length = (length == string::npos ? tag.size() : length) - index; |
|
|
|
0
|
|
|
|
|
|
|
10078
|
|
|
|
|
|
|
|
|
10079
|
280
|
50
|
|
|
|
|
for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++) |
|
|
|
0
|
|
|
|
|
|
|
10080
|
280
|
100
|
|
|
|
|
if (tag[index + equal_sign] == '=') { |
|
|
|
0
|
|
|
|
|
|
|
10081
|
|
|
|
|
|
|
int value = -1, map; |
|
10082
|
40
|
|
|
|
|
|
switch (equal_sign) { |
|
10083
|
|
|
|
|
|
|
case 4: |
|
10084
|
6
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE; |
|
|
|
0
|
|
|
|
|
|
|
10085
|
|
|
|
|
|
|
break; |
|
10086
|
|
|
|
|
|
|
case 6: |
|
10087
|
16
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER; |
|
|
|
0
|
|
|
|
|
|
|
10088
|
16
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER; |
|
|
|
0
|
|
|
|
|
|
|
10089
|
16
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON; |
|
|
|
0
|
|
|
|
|
|
|
10090
|
|
|
|
|
|
|
break; |
|
10091
|
|
|
|
|
|
|
case 8: |
|
10092
|
10
|
100
|
|
|
|
|
if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE; |
|
|
|
0
|
|
|
|
|
|
|
10093
|
|
|
|
|
|
|
break; |
|
10094
|
|
|
|
|
|
|
} |
|
10095
|
|
|
|
|
|
|
|
|
10096
|
40
|
100
|
|
|
|
|
if (value >= 0) |
|
|
|
0
|
|
|
|
|
|
|
10097
|
19
|
|
|
|
|
|
per_tag[i][j].values[value] = maps[map].value(tag.c_str() + index + equal_sign + 1, length - equal_sign - 1); |
|
10098
|
|
|
|
|
|
|
break; |
|
10099
|
|
|
|
|
|
|
} |
|
10100
|
|
|
|
|
|
|
} |
|
10101
|
|
|
|
|
|
|
|
|
10102
|
10
|
50
|
|
|
|
|
if (tag.size() >= 2 && tag[1] == 'V') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10103
|
|
|
|
|
|
|
int tag_compare; |
|
10104
|
5
|
100
|
|
|
|
|
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10105
|
|
|
|
|
|
|
} |
|
10106
|
|
|
|
|
|
|
} |
|
10107
|
|
|
|
|
|
|
|
|
10108
|
|
|
|
|
|
|
// Per_form features |
|
10109
|
0
|
|
|
|
|
|
per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len); |
|
10110
|
7
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag; |
|
10111
|
7
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_FORM] = following_verb_form; |
|
10112
|
|
|
|
|
|
|
|
|
10113
|
|
|
|
|
|
|
// Update following_verb_{tag,form} _after_ filling FOLLOWING_VERB_{TAG,FORM}. |
|
10114
|
7
|
|
|
|
|
|
if (verb_candidate >= 0) { |
|
10115
|
4
|
|
|
|
|
|
following_verb_tag = per_tag[i][verb_candidate].values[TAG]; |
|
10116
|
2
|
|
|
|
|
|
following_verb_form = per_form[i].values[FORM]; |
|
10117
|
|
|
|
|
|
|
} |
|
10118
|
|
|
|
|
|
|
|
|
10119
|
|
|
|
|
|
|
// Orthographic per_form features if needed |
|
10120
|
7
|
100
|
|
|
|
|
if (analyses[i].size() == 1) { |
|
|
|
0
|
|
|
|
|
|
|
10121
|
5
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown; |
|
10122
|
5
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown; |
|
10123
|
5
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown; |
|
10124
|
5
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown; |
|
10125
|
5
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown; |
|
10126
|
5
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown; |
|
10127
|
5
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown; |
|
10128
|
2
|
50
|
|
|
|
|
} else if (forms[i].len <= 0) { |
|
|
|
0
|
|
|
|
|
|
|
10129
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1; |
|
10130
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty; |
|
10131
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty; |
|
10132
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty; |
|
10133
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty; |
|
10134
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty; |
|
10135
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty; |
|
10136
|
|
|
|
|
|
|
} else { |
|
10137
|
2
|
|
|
|
|
|
string_piece form = forms[i]; |
|
10138
|
2
|
|
|
|
|
|
const char* form_start = form.str; |
|
10139
|
|
|
|
|
|
|
|
|
10140
|
|
|
|
|
|
|
bool num = false, cap = false, dash = false; |
|
10141
|
18
|
|
|
|
|
|
size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters |
|
10142
|
|
|
|
|
|
|
int index = 0; |
|
10143
|
18
|
100
|
|
|
|
|
while (form.len) { |
|
|
|
0
|
|
|
|
|
|
|
10144
|
16
|
|
|
|
|
|
indices[(index++) % 18] = form.str - form_start; |
|
10145
|
|
|
|
|
|
|
|
|
10146
|
16
|
|
|
|
|
|
unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len)); |
|
10147
|
16
|
50
|
|
|
|
|
num = num || cat & unicode::N; |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10148
|
16
|
100
|
|
|
|
|
cap = cap || cat & unicode::Lut; |
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10149
|
16
|
50
|
|
|
|
|
dash = dash || cat & unicode::Pd; |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10150
|
|
|
|
|
|
|
|
|
10151
|
16
|
50
|
|
|
|
|
if (index == 10 || (!form.len && index < 10)) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10152
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]); |
|
10153
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]); |
|
10154
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]); |
|
10155
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]); |
|
10156
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]); |
|
10157
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]); |
|
10158
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]); |
|
10159
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]); |
|
10160
|
2
|
|
|
|
|
|
per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]); |
|
10161
|
|
|
|
|
|
|
} |
|
10162
|
|
|
|
|
|
|
} |
|
10163
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]); |
|
10164
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]); |
|
10165
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]); |
|
10166
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]); |
|
10167
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]); |
|
10168
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]); |
|
10169
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]); |
|
10170
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]); |
|
10171
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]); |
|
10172
|
2
|
|
|
|
|
|
per_form[i].values[NUM] = elementary_feature_empty + 1 + num; |
|
10173
|
2
|
|
|
|
|
|
per_form[i].values[CAP] = elementary_feature_empty + 1 + cap; |
|
10174
|
2
|
|
|
|
|
|
per_form[i].values[DASH] = elementary_feature_empty + 1 + dash; |
|
10175
|
|
|
|
|
|
|
} |
|
10176
|
|
|
|
|
|
|
} |
|
10177
|
1
|
|
|
|
|
|
} |
|
10178
|
|
|
|
|
|
|
|
|
10179
|
|
|
|
|
|
|
template |
|
10180
|
|
|
|
|
|
|
void conllu_elementary_features |
|
10181
|
15
|
100
|
|
|
|
|
if (prev_dynamic) { |
|
|
|
0
|
|
|
|
|
|
|
10182
|
12
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG]; |
|
10183
|
12
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_FORM] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_FORM]; |
|
10184
|
|
|
|
|
|
|
} else { |
|
10185
|
3
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty; |
|
10186
|
3
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_FORM] = elementary_feature_empty; |
|
10187
|
|
|
|
|
|
|
} |
|
10188
|
|
|
|
|
|
|
|
|
10189
|
15
|
50
|
|
|
|
|
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10190
|
4
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG]; |
|
10191
|
4
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = per_form.values[FORM]; |
|
10192
|
|
|
|
|
|
|
} else { |
|
10193
|
11
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG]; |
|
10194
|
11
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_FORM] = dynamic.values[PREVIOUS_VERB_FORM]; |
|
10195
|
|
|
|
|
|
|
} |
|
10196
|
|
|
|
|
|
|
} |
|
10197
|
|
|
|
|
|
|
|
|
10198
|
|
|
|
|
|
|
} // namespace morphodita |
|
10199
|
|
|
|
|
|
|
|
|
10200
|
|
|
|
|
|
|
///////// |
|
10201
|
|
|
|
|
|
|
// File: morphodita/tagger/czech_elementary_features.h |
|
10202
|
|
|
|
|
|
|
///////// |
|
10203
|
|
|
|
|
|
|
|
|
10204
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10205
|
|
|
|
|
|
|
// |
|
10206
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10207
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10208
|
|
|
|
|
|
|
// |
|
10209
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10210
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10211
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10212
|
|
|
|
|
|
|
|
|
10213
|
|
|
|
|
|
|
namespace morphodita { |
|
10214
|
|
|
|
|
|
|
|
|
10215
|
|
|
|
|
|
|
// Declarations |
|
10216
|
|
|
|
|
|
|
template |
|
10217
|
0
|
|
|
|
|
|
class czech_elementary_features : public elementary_features |
|
10218
|
|
|
|
|
|
|
public: |
|
10219
|
|
|
|
|
|
|
czech_elementary_features(); |
|
10220
|
|
|
|
|
|
|
|
|
10221
|
|
|
|
|
|
|
enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, PER_FORM_TOTAL }; |
|
10222
|
|
|
|
|
|
|
enum features_per_tag { TAG, TAG3, TAG5, TAG25, LEMMA, PER_TAG_TOTAL }; |
|
10223
|
|
|
|
|
|
|
enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL }; |
|
10224
|
|
|
|
|
|
|
enum features_map { MAP_NONE = -1, MAP_FORM, MAP_LEMMA, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_TAG, MAP_TAG3, MAP_TAG5, MAP_TAG25, MAP_TOTAL } ; |
|
10225
|
|
|
|
|
|
|
|
|
10226
|
|
|
|
|
|
|
struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; }; |
|
10227
|
|
|
|
|
|
|
struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; }; |
|
10228
|
|
|
|
|
|
|
struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; }; |
|
10229
|
|
|
|
|
|
|
|
|
10230
|
|
|
|
|
|
|
static vector descriptions; |
|
10231
|
|
|
|
|
|
|
|
|
10232
|
|
|
|
|
|
|
void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const; |
|
10233
|
|
|
|
|
|
|
inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const; |
|
10234
|
|
|
|
|
|
|
|
|
10235
|
|
|
|
|
|
|
using elementary_features |
|
10236
|
|
|
|
|
|
|
}; |
|
10237
|
|
|
|
|
|
|
|
|
10238
|
|
|
|
|
|
|
typedef czech_elementary_features persistent_czech_elementary_features; |
|
10239
|
|
|
|
|
|
|
|
|
10240
|
|
|
|
|
|
|
// Definitions |
|
10241
|
|
|
|
|
|
|
template |
|
10242
|
0
|
|
|
|
|
|
czech_elementary_features |
|
10243
|
0
|
0
|
|
|
|
|
maps.resize(MAP_TOTAL); |
|
10244
|
0
|
|
|
|
|
|
} |
|
10245
|
|
|
|
|
|
|
|
|
10246
|
|
|
|
|
|
|
template |
|
10247
|
|
|
|
|
|
|
vector czech_elementary_features |
|
10248
|
|
|
|
|
|
|
{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM}, |
|
10249
|
|
|
|
|
|
|
{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG}, |
|
10250
|
|
|
|
|
|
|
{"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA }, |
|
10251
|
|
|
|
|
|
|
{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE}, |
|
10252
|
|
|
|
|
|
|
{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE}, |
|
10253
|
|
|
|
|
|
|
{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE}, |
|
10254
|
|
|
|
|
|
|
{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1}, |
|
10255
|
|
|
|
|
|
|
{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2}, |
|
10256
|
|
|
|
|
|
|
{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3}, |
|
10257
|
|
|
|
|
|
|
{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4}, |
|
10258
|
|
|
|
|
|
|
{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1}, |
|
10259
|
|
|
|
|
|
|
{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2}, |
|
10260
|
|
|
|
|
|
|
{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3}, |
|
10261
|
|
|
|
|
|
|
{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4}, |
|
10262
|
|
|
|
|
|
|
|
|
10263
|
|
|
|
|
|
|
{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG}, |
|
10264
|
|
|
|
|
|
|
{"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3}, |
|
10265
|
|
|
|
|
|
|
{"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5}, |
|
10266
|
|
|
|
|
|
|
{"Tag25", PER_TAG, ANY_OFFSET, TAG25, MAP_TAG25}, |
|
10267
|
|
|
|
|
|
|
{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA}, |
|
10268
|
|
|
|
|
|
|
|
|
10269
|
|
|
|
|
|
|
{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG}, |
|
10270
|
|
|
|
|
|
|
{"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA} |
|
10271
|
|
|
|
|
|
|
}; |
|
10272
|
|
|
|
|
|
|
|
|
10273
|
|
|
|
|
|
|
template |
|
10274
|
0
|
|
|
|
|
|
void czech_elementary_features |
|
10275
|
|
|
|
|
|
|
using namespace unilib; |
|
10276
|
|
|
|
|
|
|
|
|
10277
|
|
|
|
|
|
|
// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly. |
|
10278
|
|
|
|
|
|
|
elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty; |
|
10279
|
0
|
0
|
|
|
|
|
for (unsigned i = forms.size(); i--;) { |
|
10280
|
|
|
|
|
|
|
int verb_candidate = -1; |
|
10281
|
|
|
|
|
|
|
|
|
10282
|
|
|
|
|
|
|
// Per_tag features and verb_candidate |
|
10283
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
10284
|
|
|
|
|
|
|
char tag25[2]; |
|
10285
|
0
|
|
|
|
|
|
per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size()); |
|
10286
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
|
10287
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
|
10288
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty; |
|
10289
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
|
0
|
|
|
|
|
|
|
10290
|
|
|
|
|
|
|
maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size()); |
|
10291
|
|
|
|
|
|
|
|
|
10292
|
0
|
0
|
|
|
|
|
if (analyses[i][j].tag[0] == 'V') { |
|
10293
|
|
|
|
|
|
|
int tag_compare; |
|
10294
|
0
|
0
|
|
|
|
|
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
|
0
|
|
|
|
|
|
|
10295
|
|
|
|
|
|
|
} |
|
10296
|
|
|
|
|
|
|
} |
|
10297
|
|
|
|
|
|
|
|
|
10298
|
|
|
|
|
|
|
// Per_form features |
|
10299
|
0
|
|
|
|
|
|
per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len); |
|
10300
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag; |
|
10301
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma; |
|
10302
|
|
|
|
|
|
|
|
|
10303
|
|
|
|
|
|
|
// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}. |
|
10304
|
0
|
0
|
|
|
|
|
if (verb_candidate >= 0) { |
|
10305
|
0
|
|
|
|
|
|
following_verb_tag = per_tag[i][verb_candidate].values[TAG]; |
|
10306
|
0
|
|
|
|
|
|
following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA]; |
|
10307
|
|
|
|
|
|
|
} |
|
10308
|
|
|
|
|
|
|
|
|
10309
|
|
|
|
|
|
|
// Orthographic per_form features if needed |
|
10310
|
0
|
0
|
|
|
|
|
if (analyses[i].size() == 1) { |
|
10311
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown; |
|
10312
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_unknown; |
|
10313
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_unknown; |
|
10314
|
0
|
0
|
|
|
|
|
} else if (forms[i].len <= 0) { |
|
10315
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1; |
|
10316
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = per_form[i].values[PREFIX4] = elementary_feature_empty; |
|
10317
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = per_form[i].values[SUFFIX4] = elementary_feature_empty; |
|
10318
|
|
|
|
|
|
|
} else { |
|
10319
|
0
|
|
|
|
|
|
string_piece form = forms[i]; |
|
10320
|
0
|
|
|
|
|
|
const char* form_start = form.str; |
|
10321
|
|
|
|
|
|
|
|
|
10322
|
|
|
|
|
|
|
bool num = false, cap = false, dash = false; |
|
10323
|
0
|
|
|
|
|
|
size_t indices[8] = {0, form.len, form.len, form.len, form.len, 0, 0, 0}; // careful here regarding forms shorter than 4 characters |
|
10324
|
|
|
|
|
|
|
int index = 0; |
|
10325
|
0
|
0
|
|
|
|
|
while (form.len) { |
|
10326
|
0
|
|
|
|
|
|
indices[(index++)&7] = form.str - form_start; |
|
10327
|
|
|
|
|
|
|
|
|
10328
|
0
|
|
|
|
|
|
unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len)); |
|
10329
|
0
|
0
|
|
|
|
|
num = num || cat & unicode::N; |
|
|
|
0
|
|
|
|
|
|
|
10330
|
0
|
0
|
|
|
|
|
cap = cap || cat & unicode::Lut; |
|
|
|
0
|
|
|
|
|
|
|
10331
|
0
|
0
|
|
|
|
|
dash = dash || cat & unicode::Pd; |
|
|
|
0
|
|
|
|
|
|
|
10332
|
|
|
|
|
|
|
|
|
10333
|
0
|
0
|
|
|
|
|
if (index == 5 || (!form.len && index < 5)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10334
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]); |
|
10335
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]); |
|
10336
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]); |
|
10337
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]); |
|
10338
|
|
|
|
|
|
|
} |
|
10339
|
|
|
|
|
|
|
} |
|
10340
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index-1)&7], form.str - form_start - indices[(index-1)&7]); |
|
10341
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index-2)&7], form.str - form_start - indices[(index-2)&7]); |
|
10342
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index-3)&7], form.str - form_start - indices[(index-3)&7]); |
|
10343
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index-4)&7], form.str - form_start - indices[(index-4)&7]); |
|
10344
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = elementary_feature_empty + 1 + num; |
|
10345
|
0
|
|
|
|
|
|
per_form[i].values[CAP] = elementary_feature_empty + 1 + cap; |
|
10346
|
0
|
|
|
|
|
|
per_form[i].values[DASH] = elementary_feature_empty + 1 + dash; |
|
10347
|
|
|
|
|
|
|
} |
|
10348
|
|
|
|
|
|
|
} |
|
10349
|
0
|
|
|
|
|
|
} |
|
10350
|
|
|
|
|
|
|
|
|
10351
|
|
|
|
|
|
|
template |
|
10352
|
|
|
|
|
|
|
void czech_elementary_features |
|
10353
|
0
|
0
|
|
|
|
|
if (prev_dynamic) { |
|
10354
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG]; |
|
10355
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA]; |
|
10356
|
|
|
|
|
|
|
} else { |
|
10357
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty; |
|
10358
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty; |
|
10359
|
|
|
|
|
|
|
} |
|
10360
|
|
|
|
|
|
|
|
|
10361
|
0
|
0
|
|
|
|
|
if (tag.tag[0] == 'V') { |
|
10362
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG]; |
|
10363
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA]; |
|
10364
|
|
|
|
|
|
|
} else { |
|
10365
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG]; |
|
10366
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA]; |
|
10367
|
|
|
|
|
|
|
} |
|
10368
|
|
|
|
|
|
|
} |
|
10369
|
|
|
|
|
|
|
|
|
10370
|
|
|
|
|
|
|
} // namespace morphodita |
|
10371
|
|
|
|
|
|
|
|
|
10372
|
|
|
|
|
|
|
///////// |
|
10373
|
|
|
|
|
|
|
// File: morphodita/tagger/generic_elementary_features.h |
|
10374
|
|
|
|
|
|
|
///////// |
|
10375
|
|
|
|
|
|
|
|
|
10376
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10377
|
|
|
|
|
|
|
// |
|
10378
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10379
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10380
|
|
|
|
|
|
|
// |
|
10381
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10382
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10383
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10384
|
|
|
|
|
|
|
|
|
10385
|
|
|
|
|
|
|
namespace morphodita { |
|
10386
|
|
|
|
|
|
|
|
|
10387
|
|
|
|
|
|
|
// Declarations |
|
10388
|
|
|
|
|
|
|
template |
|
10389
|
0
|
|
|
|
|
|
class generic_elementary_features : public elementary_features |
|
10390
|
|
|
|
|
|
|
public: |
|
10391
|
|
|
|
|
|
|
generic_elementary_features(); |
|
10392
|
|
|
|
|
|
|
|
|
10393
|
|
|
|
|
|
|
enum features_per_form { FORM, FOLLOWING_VERB_TAG, FOLLOWING_VERB_LEMMA, NUM, CAP, DASH, PREFIX1, PREFIX2, PREFIX3, PREFIX4, PREFIX5, PREFIX6, PREFIX7, PREFIX8, PREFIX9, SUFFIX1, SUFFIX2, SUFFIX3, SUFFIX4, SUFFIX5, SUFFIX6, SUFFIX7, SUFFIX8, SUFFIX9, PER_FORM_TOTAL }; |
|
10394
|
|
|
|
|
|
|
enum features_per_tag { TAG, TAG1, TAG2, TAG3, TAG4, TAG5, LEMMA, PER_TAG_TOTAL }; |
|
10395
|
|
|
|
|
|
|
enum features_dynamic { PREVIOUS_VERB_TAG, PREVIOUS_VERB_LEMMA, PREVIOUS_OR_CURRENT_VERB_TAG, PREVIOUS_OR_CURRENT_VERB_LEMMA, DYNAMIC_TOTAL }; |
|
10396
|
|
|
|
|
|
|
enum features_map { MAP_NONE = -1, MAP_FORM, MAP_PREFIX1, MAP_PREFIX2, MAP_PREFIX3, MAP_PREFIX4, MAP_PREFIX5, MAP_PREFIX6, MAP_PREFIX7, MAP_PREFIX8, MAP_PREFIX9, MAP_SUFFIX1, MAP_SUFFIX2, MAP_SUFFIX3, MAP_SUFFIX4, MAP_SUFFIX5, MAP_SUFFIX6, MAP_SUFFIX7, MAP_SUFFIX8, MAP_SUFFIX9, MAP_TAG, MAP_TAG1, MAP_TAG2, MAP_TAG3, MAP_TAG4, MAP_TAG5, MAP_LEMMA, MAP_TOTAL } ; |
|
10397
|
|
|
|
|
|
|
|
|
10398
|
|
|
|
|
|
|
struct per_form_features { elementary_feature_value values[PER_FORM_TOTAL]; }; |
|
10399
|
|
|
|
|
|
|
struct per_tag_features { elementary_feature_value values[PER_TAG_TOTAL]; }; |
|
10400
|
|
|
|
|
|
|
struct dynamic_features { elementary_feature_value values[DYNAMIC_TOTAL]; }; |
|
10401
|
|
|
|
|
|
|
|
|
10402
|
|
|
|
|
|
|
static vector descriptions; |
|
10403
|
|
|
|
|
|
|
|
|
10404
|
|
|
|
|
|
|
void compute_features(const vector& forms, const vector>& analyses, vector& per_form, vector>& per_tag) const; |
|
10405
|
|
|
|
|
|
|
inline void compute_dynamic_features(const tagged_lemma& tag, const per_form_features& per_form, const per_tag_features& per_tag, const dynamic_features* prev_dynamic, dynamic_features& dynamic) const; |
|
10406
|
|
|
|
|
|
|
|
|
10407
|
|
|
|
|
|
|
using elementary_features |
|
10408
|
|
|
|
|
|
|
}; |
|
10409
|
|
|
|
|
|
|
|
|
10410
|
|
|
|
|
|
|
typedef generic_elementary_features persistent_generic_elementary_features; |
|
10411
|
|
|
|
|
|
|
|
|
10412
|
|
|
|
|
|
|
// Definitions |
|
10413
|
|
|
|
|
|
|
template |
|
10414
|
0
|
|
|
|
|
|
generic_elementary_features |
|
10415
|
0
|
0
|
|
|
|
|
maps.resize(MAP_TOTAL); |
|
10416
|
0
|
|
|
|
|
|
} |
|
10417
|
|
|
|
|
|
|
|
|
10418
|
|
|
|
|
|
|
template |
|
10419
|
|
|
|
|
|
|
vector generic_elementary_features |
|
10420
|
|
|
|
|
|
|
{"Form", PER_FORM, ANY_OFFSET, FORM, MAP_FORM}, |
|
10421
|
|
|
|
|
|
|
{"FollowingVerbTag", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_TAG, MAP_TAG}, |
|
10422
|
|
|
|
|
|
|
{"FollowingVerbLemma", PER_FORM, ANY_OFFSET, FOLLOWING_VERB_LEMMA, MAP_LEMMA }, |
|
10423
|
|
|
|
|
|
|
{"Num", PER_FORM, ONLY_CURRENT, NUM, MAP_NONE}, |
|
10424
|
|
|
|
|
|
|
{"Cap", PER_FORM, ONLY_CURRENT, CAP, MAP_NONE}, |
|
10425
|
|
|
|
|
|
|
{"Dash", PER_FORM, ONLY_CURRENT, DASH, MAP_NONE}, |
|
10426
|
|
|
|
|
|
|
{"Prefix1", PER_FORM, ONLY_CURRENT, PREFIX1, MAP_PREFIX1}, |
|
10427
|
|
|
|
|
|
|
{"Prefix2", PER_FORM, ONLY_CURRENT, PREFIX2, MAP_PREFIX2}, |
|
10428
|
|
|
|
|
|
|
{"Prefix3", PER_FORM, ONLY_CURRENT, PREFIX3, MAP_PREFIX3}, |
|
10429
|
|
|
|
|
|
|
{"Prefix4", PER_FORM, ONLY_CURRENT, PREFIX4, MAP_PREFIX4}, |
|
10430
|
|
|
|
|
|
|
{"Prefix5", PER_FORM, ONLY_CURRENT, PREFIX5, MAP_PREFIX5}, |
|
10431
|
|
|
|
|
|
|
{"Prefix6", PER_FORM, ONLY_CURRENT, PREFIX6, MAP_PREFIX6}, |
|
10432
|
|
|
|
|
|
|
{"Prefix7", PER_FORM, ONLY_CURRENT, PREFIX7, MAP_PREFIX7}, |
|
10433
|
|
|
|
|
|
|
{"Prefix8", PER_FORM, ONLY_CURRENT, PREFIX8, MAP_PREFIX8}, |
|
10434
|
|
|
|
|
|
|
{"Prefix9", PER_FORM, ONLY_CURRENT, PREFIX9, MAP_PREFIX9}, |
|
10435
|
|
|
|
|
|
|
{"Suffix1", PER_FORM, ONLY_CURRENT, SUFFIX1, MAP_SUFFIX1}, |
|
10436
|
|
|
|
|
|
|
{"Suffix2", PER_FORM, ONLY_CURRENT, SUFFIX2, MAP_SUFFIX2}, |
|
10437
|
|
|
|
|
|
|
{"Suffix3", PER_FORM, ONLY_CURRENT, SUFFIX3, MAP_SUFFIX3}, |
|
10438
|
|
|
|
|
|
|
{"Suffix4", PER_FORM, ONLY_CURRENT, SUFFIX4, MAP_SUFFIX4}, |
|
10439
|
|
|
|
|
|
|
{"Suffix5", PER_FORM, ONLY_CURRENT, SUFFIX5, MAP_SUFFIX5}, |
|
10440
|
|
|
|
|
|
|
{"Suffix6", PER_FORM, ONLY_CURRENT, SUFFIX6, MAP_SUFFIX6}, |
|
10441
|
|
|
|
|
|
|
{"Suffix7", PER_FORM, ONLY_CURRENT, SUFFIX7, MAP_SUFFIX7}, |
|
10442
|
|
|
|
|
|
|
{"Suffix8", PER_FORM, ONLY_CURRENT, SUFFIX8, MAP_SUFFIX8}, |
|
10443
|
|
|
|
|
|
|
{"Suffix9", PER_FORM, ONLY_CURRENT, SUFFIX9, MAP_SUFFIX9}, |
|
10444
|
|
|
|
|
|
|
|
|
10445
|
|
|
|
|
|
|
{"Tag", PER_TAG, ANY_OFFSET, TAG, MAP_TAG}, |
|
10446
|
|
|
|
|
|
|
{"Tag1", PER_TAG, ANY_OFFSET, TAG1, MAP_TAG1}, |
|
10447
|
|
|
|
|
|
|
{"Tag2", PER_TAG, ANY_OFFSET, TAG2, MAP_TAG2}, |
|
10448
|
|
|
|
|
|
|
{"Tag3", PER_TAG, ANY_OFFSET, TAG3, MAP_TAG3}, |
|
10449
|
|
|
|
|
|
|
{"Tag4", PER_TAG, ANY_OFFSET, TAG4, MAP_TAG4}, |
|
10450
|
|
|
|
|
|
|
{"Tag5", PER_TAG, ANY_OFFSET, TAG5, MAP_TAG5}, |
|
10451
|
|
|
|
|
|
|
{"Lemma", PER_TAG, ANY_OFFSET, LEMMA, MAP_LEMMA}, |
|
10452
|
|
|
|
|
|
|
|
|
10453
|
|
|
|
|
|
|
{"PreviousVerbTag", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_TAG, MAP_TAG}, |
|
10454
|
|
|
|
|
|
|
{"PreviousVerbLemma", DYNAMIC, ANY_OFFSET, PREVIOUS_VERB_LEMMA, MAP_LEMMA} |
|
10455
|
|
|
|
|
|
|
}; |
|
10456
|
|
|
|
|
|
|
|
|
10457
|
|
|
|
|
|
|
template |
|
10458
|
0
|
|
|
|
|
|
void generic_elementary_features |
|
10459
|
|
|
|
|
|
|
using namespace unilib; |
|
10460
|
|
|
|
|
|
|
|
|
10461
|
|
|
|
|
|
|
// We process the sentence in reverse order, so that we can compute FollowingVerbTag and FollowingVerbLemma directly. |
|
10462
|
|
|
|
|
|
|
elementary_feature_value following_verb_tag = elementary_feature_empty, following_verb_lemma = elementary_feature_empty; |
|
10463
|
0
|
0
|
|
|
|
|
for (unsigned i = forms.size(); i--;) { |
|
10464
|
|
|
|
|
|
|
int verb_candidate = -1; |
|
10465
|
|
|
|
|
|
|
|
|
10466
|
|
|
|
|
|
|
// Per_tag features and verb_candidate |
|
10467
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
10468
|
0
|
|
|
|
|
|
per_tag[i][j].values[TAG] = maps[MAP_TAG].value(analyses[i][j].tag.c_str(), analyses[i][j].tag.size()); |
|
10469
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty; |
|
10470
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty; |
|
10471
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
|
10472
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty; |
|
10473
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
|
10474
|
0
|
0
|
|
|
|
|
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
|
0
|
|
|
|
|
|
|
10475
|
|
|
|
|
|
|
maps[MAP_LEMMA].value(analyses[i][j].lemma.c_str(), analyses[i][j].lemma.size()); |
|
10476
|
|
|
|
|
|
|
|
|
10477
|
0
|
0
|
|
|
|
|
if (analyses[i][j].tag[0] == 'V') { |
|
10478
|
|
|
|
|
|
|
int tag_compare; |
|
10479
|
0
|
0
|
|
|
|
|
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
|
0
|
|
|
|
|
|
|
10480
|
|
|
|
|
|
|
} |
|
10481
|
|
|
|
|
|
|
} |
|
10482
|
|
|
|
|
|
|
|
|
10483
|
|
|
|
|
|
|
// Per_form features |
|
10484
|
0
|
|
|
|
|
|
per_form[i].values[FORM] = maps[MAP_FORM].value(forms[i].str, forms[i].len); |
|
10485
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_TAG] = following_verb_tag; |
|
10486
|
0
|
|
|
|
|
|
per_form[i].values[FOLLOWING_VERB_LEMMA] = following_verb_lemma; |
|
10487
|
|
|
|
|
|
|
|
|
10488
|
|
|
|
|
|
|
// Update following_verb_{tag,lemma} _after_ filling FOLLOWING_VERB_{TAG,LEMMA}. |
|
10489
|
0
|
0
|
|
|
|
|
if (verb_candidate >= 0) { |
|
10490
|
0
|
|
|
|
|
|
following_verb_tag = per_tag[i][verb_candidate].values[TAG]; |
|
10491
|
0
|
|
|
|
|
|
following_verb_lemma = per_tag[i][verb_candidate].values[LEMMA]; |
|
10492
|
|
|
|
|
|
|
} |
|
10493
|
|
|
|
|
|
|
|
|
10494
|
|
|
|
|
|
|
// Ortographic per_form features if needed |
|
10495
|
0
|
0
|
|
|
|
|
if (analyses[i].size() == 1) { |
|
10496
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_unknown; |
|
10497
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_unknown; |
|
10498
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_unknown; |
|
10499
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_unknown; |
|
10500
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_unknown; |
|
10501
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_unknown; |
|
10502
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_unknown; |
|
10503
|
0
|
0
|
|
|
|
|
} else if (forms[i].len <= 0) { |
|
10504
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = per_form[i].values[CAP] = per_form[i].values[DASH] = elementary_feature_empty + 1; |
|
10505
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = per_form[i].values[PREFIX2] = per_form[i].values[PREFIX3] = elementary_feature_empty; |
|
10506
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = per_form[i].values[PREFIX5] = per_form[i].values[PREFIX6] = elementary_feature_empty; |
|
10507
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = per_form[i].values[PREFIX8] = per_form[i].values[PREFIX9] = elementary_feature_empty; |
|
10508
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = per_form[i].values[SUFFIX2] = per_form[i].values[SUFFIX3] = elementary_feature_empty; |
|
10509
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = per_form[i].values[SUFFIX5] = per_form[i].values[SUFFIX6] = elementary_feature_empty; |
|
10510
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = per_form[i].values[SUFFIX8] = per_form[i].values[SUFFIX9] = elementary_feature_empty; |
|
10511
|
|
|
|
|
|
|
} else { |
|
10512
|
0
|
|
|
|
|
|
string_piece form = forms[i]; |
|
10513
|
0
|
|
|
|
|
|
const char* form_start = form.str; |
|
10514
|
|
|
|
|
|
|
|
|
10515
|
|
|
|
|
|
|
bool num = false, cap = false, dash = false; |
|
10516
|
0
|
|
|
|
|
|
size_t indices[18] = {0, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, form.len, 0, 0, 0, 0, 0, 0, 0, 0}; // careful here regarding forms shorter than 9 characters |
|
10517
|
|
|
|
|
|
|
int index = 0; |
|
10518
|
0
|
0
|
|
|
|
|
while (form.len) { |
|
10519
|
0
|
|
|
|
|
|
indices[(index++) % 18] = form.str - form_start; |
|
10520
|
|
|
|
|
|
|
|
|
10521
|
0
|
|
|
|
|
|
unicode::category_t cat = unicode::category(utf8::decode(form.str, form.len)); |
|
10522
|
0
|
0
|
|
|
|
|
num = num || cat & unicode::N; |
|
|
|
0
|
|
|
|
|
|
|
10523
|
0
|
0
|
|
|
|
|
cap = cap || cat & unicode::Lut; |
|
|
|
0
|
|
|
|
|
|
|
10524
|
0
|
0
|
|
|
|
|
dash = dash || cat & unicode::Pd; |
|
|
|
0
|
|
|
|
|
|
|
10525
|
|
|
|
|
|
|
|
|
10526
|
0
|
0
|
|
|
|
|
if (index == 10 || (!form.len && index < 10)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10527
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX1] = maps[MAP_PREFIX1].value(form_start, indices[1]); |
|
10528
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX2] = maps[MAP_PREFIX2].value(form_start, indices[2]); |
|
10529
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX3] = maps[MAP_PREFIX3].value(form_start, indices[3]); |
|
10530
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX4] = maps[MAP_PREFIX4].value(form_start, indices[4]); |
|
10531
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX5] = maps[MAP_PREFIX5].value(form_start, indices[5]); |
|
10532
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX6] = maps[MAP_PREFIX6].value(form_start, indices[6]); |
|
10533
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX7] = maps[MAP_PREFIX7].value(form_start, indices[7]); |
|
10534
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX8] = maps[MAP_PREFIX8].value(form_start, indices[8]); |
|
10535
|
0
|
|
|
|
|
|
per_form[i].values[PREFIX9] = maps[MAP_PREFIX9].value(form_start, indices[9]); |
|
10536
|
|
|
|
|
|
|
} |
|
10537
|
|
|
|
|
|
|
} |
|
10538
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX1] = maps[MAP_SUFFIX1].value(form_start + indices[(index+18-1) % 18], form.str - form_start - indices[(index+18-1) % 18]); |
|
10539
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX2] = maps[MAP_SUFFIX2].value(form_start + indices[(index+18-2) % 18], form.str - form_start - indices[(index+18-2) % 18]); |
|
10540
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX3] = maps[MAP_SUFFIX3].value(form_start + indices[(index+18-3) % 18], form.str - form_start - indices[(index+18-3) % 18]); |
|
10541
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX4] = maps[MAP_SUFFIX4].value(form_start + indices[(index+18-4) % 18], form.str - form_start - indices[(index+18-4) % 18]); |
|
10542
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX5] = maps[MAP_SUFFIX5].value(form_start + indices[(index+18-5) % 18], form.str - form_start - indices[(index+18-5) % 18]); |
|
10543
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX6] = maps[MAP_SUFFIX6].value(form_start + indices[(index+18-6) % 18], form.str - form_start - indices[(index+18-6) % 18]); |
|
10544
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX7] = maps[MAP_SUFFIX7].value(form_start + indices[(index+18-7) % 18], form.str - form_start - indices[(index+18-7) % 18]); |
|
10545
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX8] = maps[MAP_SUFFIX8].value(form_start + indices[(index+18-8) % 18], form.str - form_start - indices[(index+18-8) % 18]); |
|
10546
|
0
|
|
|
|
|
|
per_form[i].values[SUFFIX9] = maps[MAP_SUFFIX9].value(form_start + indices[(index+18-9) % 18], form.str - form_start - indices[(index+18-9) % 18]); |
|
10547
|
0
|
|
|
|
|
|
per_form[i].values[NUM] = elementary_feature_empty + 1 + num; |
|
10548
|
0
|
|
|
|
|
|
per_form[i].values[CAP] = elementary_feature_empty + 1 + cap; |
|
10549
|
0
|
|
|
|
|
|
per_form[i].values[DASH] = elementary_feature_empty + 1 + dash; |
|
10550
|
|
|
|
|
|
|
} |
|
10551
|
|
|
|
|
|
|
} |
|
10552
|
0
|
|
|
|
|
|
} |
|
10553
|
|
|
|
|
|
|
|
|
10554
|
|
|
|
|
|
|
template |
|
10555
|
|
|
|
|
|
|
void generic_elementary_features |
|
10556
|
0
|
0
|
|
|
|
|
if (prev_dynamic) { |
|
10557
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_TAG]; |
|
10558
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = prev_dynamic->values[PREVIOUS_OR_CURRENT_VERB_LEMMA]; |
|
10559
|
|
|
|
|
|
|
} else { |
|
10560
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_TAG] = elementary_feature_empty; |
|
10561
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_VERB_LEMMA] = elementary_feature_empty; |
|
10562
|
|
|
|
|
|
|
} |
|
10563
|
|
|
|
|
|
|
|
|
10564
|
0
|
0
|
|
|
|
|
if (tag.tag[0] == 'V') { |
|
10565
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = per_tag.values[TAG]; |
|
10566
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = per_tag.values[LEMMA]; |
|
10567
|
|
|
|
|
|
|
} else { |
|
10568
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_TAG] = dynamic.values[PREVIOUS_VERB_TAG]; |
|
10569
|
0
|
|
|
|
|
|
dynamic.values[PREVIOUS_OR_CURRENT_VERB_LEMMA] = dynamic.values[PREVIOUS_VERB_LEMMA]; |
|
10570
|
|
|
|
|
|
|
} |
|
10571
|
|
|
|
|
|
|
} |
|
10572
|
|
|
|
|
|
|
|
|
10573
|
|
|
|
|
|
|
} // namespace morphodita |
|
10574
|
|
|
|
|
|
|
|
|
10575
|
|
|
|
|
|
|
///////// |
|
10576
|
|
|
|
|
|
|
// File: morphodita/tagger/perceptron_tagger.h |
|
10577
|
|
|
|
|
|
|
///////// |
|
10578
|
|
|
|
|
|
|
|
|
10579
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10580
|
|
|
|
|
|
|
// |
|
10581
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10582
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10583
|
|
|
|
|
|
|
// |
|
10584
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10585
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10586
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10587
|
|
|
|
|
|
|
|
|
10588
|
|
|
|
|
|
|
namespace morphodita { |
|
10589
|
|
|
|
|
|
|
|
|
10590
|
|
|
|
|
|
|
// Declarations |
|
10591
|
|
|
|
|
|
|
template |
|
10592
|
4
|
|
|
|
|
|
class perceptron_tagger : public tagger { |
|
10593
|
|
|
|
|
|
|
public: |
|
10594
|
|
|
|
|
|
|
perceptron_tagger(int decoding_order, int window_size); |
|
10595
|
|
|
|
|
|
|
|
|
10596
|
|
|
|
|
|
|
bool load(istream& is); |
|
10597
|
|
|
|
|
|
|
virtual const morpho* get_morpho() const override; |
|
10598
|
|
|
|
|
|
|
virtual void tag(const vector& forms, vector& tags, morpho::guesser_mode guesser = morpho::guesser_mode(-1)) const override; |
|
10599
|
|
|
|
|
|
|
virtual void tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const override; |
|
10600
|
|
|
|
|
|
|
|
|
10601
|
|
|
|
|
|
|
private: |
|
10602
|
|
|
|
|
|
|
int decoding_order, window_size; |
|
10603
|
|
|
|
|
|
|
|
|
10604
|
|
|
|
|
|
|
unique_ptr dict; |
|
10605
|
|
|
|
|
|
|
bool use_guesser; |
|
10606
|
|
|
|
|
|
|
FeatureSequences features; |
|
10607
|
|
|
|
|
|
|
typedef viterbi viterbi_decoder; |
|
10608
|
|
|
|
|
|
|
viterbi_decoder decoder; |
|
10609
|
3
|
|
|
|
|
|
struct cache { |
|
10610
|
|
|
|
|
|
|
vector forms; |
|
10611
|
|
|
|
|
|
|
vector> analyses; |
|
10612
|
|
|
|
|
|
|
vector tags; |
|
10613
|
|
|
|
|
|
|
typename viterbi_decoder::cache decoder_cache; |
|
10614
|
|
|
|
|
|
|
|
|
10615
|
1
|
0
|
|
|
|
|
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
10616
|
|
|
|
|
|
|
}; |
|
10617
|
|
|
|
|
|
|
|
|
10618
|
|
|
|
|
|
|
mutable threadsafe_stack caches; |
|
10619
|
|
|
|
|
|
|
}; |
|
10620
|
|
|
|
|
|
|
|
|
10621
|
|
|
|
|
|
|
// Definitions |
|
10622
|
|
|
|
|
|
|
|
|
10623
|
|
|
|
|
|
|
template |
|
10624
|
1
|
|
|
|
|
|
perceptron_tagger::perceptron_tagger(int decoding_order, int window_size) |
|
10625
|
1
|
|
|
|
|
|
: decoding_order(decoding_order), window_size(window_size), decoder(features, decoding_order, window_size) {} |
|
10626
|
|
|
|
|
|
|
|
|
10627
|
|
|
|
|
|
|
template |
|
10628
|
1
|
|
|
|
|
|
bool perceptron_tagger::load(istream& is) { |
|
10629
|
2
|
50
|
|
|
|
|
if (dict.reset(morpho::load(is)), !dict) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10630
|
1
|
|
|
|
|
|
use_guesser = is.get(); |
|
10631
|
1
|
50
|
|
|
|
|
if (!features.load(is)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10632
|
1
|
|
|
|
|
|
return true; |
|
10633
|
|
|
|
|
|
|
} |
|
10634
|
|
|
|
|
|
|
|
|
10635
|
|
|
|
|
|
|
template |
|
10636
|
1
|
|
|
|
|
|
const morpho* perceptron_tagger::get_morpho() const { |
|
10637
|
1
|
|
|
|
|
|
return dict.get(); |
|
10638
|
|
|
|
|
|
|
} |
|
10639
|
|
|
|
|
|
|
|
|
10640
|
|
|
|
|
|
|
template |
|
10641
|
1
|
|
|
|
|
|
void perceptron_tagger::tag(const vector& forms, vector& tags, morpho::guesser_mode guesser) const { |
|
10642
|
|
|
|
|
|
|
tags.clear(); |
|
10643
|
1
|
0
|
|
|
|
|
if (!dict) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
10644
|
|
|
|
|
|
|
|
|
10645
|
1
|
|
|
|
|
|
cache* c = caches.pop(); |
|
10646
|
1
|
0
|
|
|
|
|
if (!c) c = new cache(*this); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
10647
|
|
|
|
|
|
|
|
|
10648
|
1
|
|
|
|
|
|
c->forms.resize(forms.size()); |
|
10649
|
1
|
0
|
|
|
|
|
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
10650
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
10651
|
7
|
|
|
|
|
|
c->forms[i] = forms[i]; |
|
10652
|
7
|
|
|
|
|
|
c->forms[i].len = dict->raw_form_len(forms[i]); |
|
10653
|
7
|
0
|
|
|
|
|
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
10654
|
|
|
|
|
|
|
} |
|
10655
|
|
|
|
|
|
|
|
|
10656
|
1
|
0
|
|
|
|
|
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
10657
|
1
|
|
|
|
|
|
decoder.tag(c->forms, c->analyses, c->decoder_cache, c->tags); |
|
10658
|
|
|
|
|
|
|
|
|
10659
|
8
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
10660
|
7
|
|
|
|
|
|
tags.emplace_back(c->analyses[i][c->tags[i]]); |
|
10661
|
|
|
|
|
|
|
|
|
10662
|
1
|
|
|
|
|
|
caches.push(c); |
|
10663
|
|
|
|
|
|
|
} |
|
10664
|
|
|
|
|
|
|
|
|
10665
|
|
|
|
|
|
|
template |
|
10666
|
0
|
|
|
|
|
|
void perceptron_tagger::tag_analyzed(const vector& forms, const vector>& analyses, vector& tags) const { |
|
10667
|
|
|
|
|
|
|
tags.clear(); |
|
10668
|
|
|
|
|
|
|
|
|
10669
|
0
|
|
|
|
|
|
cache* c = caches.pop(); |
|
10670
|
0
|
0
|
|
|
|
|
if (!c) c = new cache(*this); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10671
|
|
|
|
|
|
|
|
|
10672
|
0
|
|
|
|
|
|
tags.resize(forms.size()); |
|
10673
|
0
|
|
|
|
|
|
decoder.tag(forms, analyses, c->decoder_cache, tags); |
|
10674
|
|
|
|
|
|
|
|
|
10675
|
0
|
|
|
|
|
|
caches.push(c); |
|
10676
|
0
|
|
|
|
|
|
} |
|
10677
|
|
|
|
|
|
|
|
|
10678
|
|
|
|
|
|
|
} // namespace morphodita |
|
10679
|
|
|
|
|
|
|
|
|
10680
|
|
|
|
|
|
|
///////// |
|
10681
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger.cpp |
|
10682
|
|
|
|
|
|
|
///////// |
|
10683
|
|
|
|
|
|
|
|
|
10684
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10685
|
|
|
|
|
|
|
// |
|
10686
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10687
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10688
|
|
|
|
|
|
|
// |
|
10689
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10690
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10691
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10692
|
|
|
|
|
|
|
|
|
10693
|
|
|
|
|
|
|
namespace morphodita { |
|
10694
|
|
|
|
|
|
|
|
|
10695
|
1
|
|
|
|
|
|
tagger* tagger::load(istream& is) { |
|
10696
|
1
|
50
|
|
|
|
|
tagger_id id = tagger_id(is.get()); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10697
|
|
|
|
|
|
|
switch (id) { |
|
10698
|
|
|
|
|
|
|
case tagger_ids::CZECH2: |
|
10699
|
|
|
|
|
|
|
case tagger_ids::CZECH2_3: |
|
10700
|
|
|
|
|
|
|
case tagger_ids::CZECH3: |
|
10701
|
|
|
|
|
|
|
{ |
|
10702
|
0
|
0
|
|
|
|
|
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
|
10703
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
0
|
|
|
|
|
|
|
10704
|
|
|
|
|
|
|
break; |
|
10705
|
|
|
|
|
|
|
} |
|
10706
|
|
|
|
|
|
|
case tagger_ids::GENERIC2: |
|
10707
|
|
|
|
|
|
|
case tagger_ids::GENERIC2_3: |
|
10708
|
|
|
|
|
|
|
case tagger_ids::GENERIC3: |
|
10709
|
|
|
|
|
|
|
case tagger_ids::GENERIC4: |
|
10710
|
|
|
|
|
|
|
{ |
|
10711
|
0
|
0
|
|
|
|
|
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
|
10712
|
0
|
0
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
0
|
|
|
|
|
|
|
10713
|
|
|
|
|
|
|
break; |
|
10714
|
|
|
|
|
|
|
} |
|
10715
|
|
|
|
|
|
|
case tagger_ids::CONLLU2: |
|
10716
|
|
|
|
|
|
|
case tagger_ids::CONLLU2_3: |
|
10717
|
|
|
|
|
|
|
case tagger_ids::CONLLU3: |
|
10718
|
|
|
|
|
|
|
{ |
|
10719
|
1
|
50
|
|
|
|
|
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
|
10720
|
1
|
50
|
|
|
|
|
if (res->load(is)) return res.release(); |
|
|
|
50
|
|
|
|
|
|
|
10721
|
|
|
|
|
|
|
break; |
|
10722
|
|
|
|
|
|
|
} |
|
10723
|
|
|
|
|
|
|
} |
|
10724
|
|
|
|
|
|
|
|
|
10725
|
|
|
|
|
|
|
return nullptr; |
|
10726
|
|
|
|
|
|
|
} |
|
10727
|
|
|
|
|
|
|
|
|
10728
|
0
|
|
|
|
|
|
tagger* tagger::load(const char* fname) { |
|
10729
|
0
|
0
|
|
|
|
|
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
|
10730
|
0
|
0
|
|
|
|
|
if (!f) return nullptr; |
|
10731
|
|
|
|
|
|
|
|
|
10732
|
0
|
0
|
|
|
|
|
return load(f); |
|
10733
|
|
|
|
|
|
|
} |
|
10734
|
|
|
|
|
|
|
|
|
10735
|
0
|
|
|
|
|
|
tokenizer* tagger::new_tokenizer() const { |
|
10736
|
0
|
|
|
|
|
|
auto morpho = get_morpho(); |
|
10737
|
0
|
0
|
|
|
|
|
return morpho ? morpho->new_tokenizer() : nullptr; |
|
10738
|
|
|
|
|
|
|
} |
|
10739
|
|
|
|
|
|
|
|
|
10740
|
|
|
|
|
|
|
} // namespace morphodita |
|
10741
|
|
|
|
|
|
|
|
|
10742
|
|
|
|
|
|
|
///////// |
|
10743
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/identity_tagset_converter.h |
|
10744
|
|
|
|
|
|
|
///////// |
|
10745
|
|
|
|
|
|
|
|
|
10746
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10747
|
|
|
|
|
|
|
// |
|
10748
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10749
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10750
|
|
|
|
|
|
|
// |
|
10751
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10752
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10753
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10754
|
|
|
|
|
|
|
|
|
10755
|
|
|
|
|
|
|
namespace morphodita { |
|
10756
|
|
|
|
|
|
|
|
|
10757
|
0
|
|
|
|
|
|
class identity_tagset_converter : public tagset_converter { |
|
10758
|
|
|
|
|
|
|
public: |
|
10759
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
|
10760
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
|
10761
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
|
10762
|
|
|
|
|
|
|
}; |
|
10763
|
|
|
|
|
|
|
|
|
10764
|
|
|
|
|
|
|
} // namespace morphodita |
|
10765
|
|
|
|
|
|
|
|
|
10766
|
|
|
|
|
|
|
///////// |
|
10767
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/identity_tagset_converter.cpp |
|
10768
|
|
|
|
|
|
|
///////// |
|
10769
|
|
|
|
|
|
|
|
|
10770
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10771
|
|
|
|
|
|
|
// |
|
10772
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10773
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10774
|
|
|
|
|
|
|
// |
|
10775
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10776
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10777
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10778
|
|
|
|
|
|
|
|
|
10779
|
|
|
|
|
|
|
namespace morphodita { |
|
10780
|
|
|
|
|
|
|
|
|
10781
|
0
|
|
|
|
|
|
void identity_tagset_converter::convert(tagged_lemma& /*tagged_lemma*/) const {} |
|
10782
|
|
|
|
|
|
|
|
|
10783
|
0
|
|
|
|
|
|
void identity_tagset_converter::convert_analyzed(vector& /*tagged_lemmas*/) const {} |
|
10784
|
|
|
|
|
|
|
|
|
10785
|
0
|
|
|
|
|
|
void identity_tagset_converter::convert_generated(vector& /*forms*/) const {} |
|
10786
|
|
|
|
|
|
|
|
|
10787
|
|
|
|
|
|
|
} // namespace morphodita |
|
10788
|
|
|
|
|
|
|
|
|
10789
|
|
|
|
|
|
|
///////// |
|
10790
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.h |
|
10791
|
|
|
|
|
|
|
///////// |
|
10792
|
|
|
|
|
|
|
|
|
10793
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10794
|
|
|
|
|
|
|
// |
|
10795
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10796
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10797
|
|
|
|
|
|
|
// |
|
10798
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10799
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10800
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10801
|
|
|
|
|
|
|
|
|
10802
|
|
|
|
|
|
|
namespace morphodita { |
|
10803
|
|
|
|
|
|
|
|
|
10804
|
0
|
|
|
|
|
|
class pdt_to_conll2009_tagset_converter : public tagset_converter { |
|
10805
|
|
|
|
|
|
|
public: |
|
10806
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
|
10807
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
|
10808
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
|
10809
|
|
|
|
|
|
|
|
|
10810
|
|
|
|
|
|
|
private: |
|
10811
|
|
|
|
|
|
|
inline void convert_tag(const string& lemma, string& tag) const; |
|
10812
|
|
|
|
|
|
|
inline bool convert_lemma(string& lemma) const; |
|
10813
|
|
|
|
|
|
|
}; |
|
10814
|
|
|
|
|
|
|
|
|
10815
|
|
|
|
|
|
|
} // namespace morphodita |
|
10816
|
|
|
|
|
|
|
|
|
10817
|
|
|
|
|
|
|
///////// |
|
10818
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/pdt_to_conll2009_tagset_converter.cpp |
|
10819
|
|
|
|
|
|
|
///////// |
|
10820
|
|
|
|
|
|
|
|
|
10821
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10822
|
|
|
|
|
|
|
// |
|
10823
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10824
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10825
|
|
|
|
|
|
|
// |
|
10826
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10827
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10828
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10829
|
|
|
|
|
|
|
|
|
10830
|
|
|
|
|
|
|
namespace morphodita { |
|
10831
|
|
|
|
|
|
|
|
|
10832
|
|
|
|
|
|
|
static const char* names[15] = {"POS", "SubPOS", "Gen", "Num", "Cas", "PGe", "PNu", "Per", "Ten", "Gra", "Neg", "Voi", "", "", "Var"}; |
|
10833
|
|
|
|
|
|
|
|
|
10834
|
0
|
|
|
|
|
|
inline void pdt_to_conll2009_tagset_converter::convert_tag(const string& lemma, string& tag) const { |
|
10835
|
|
|
|
|
|
|
char pdt_tag[16]; |
|
10836
|
|
|
|
|
|
|
strncpy(pdt_tag, tag.c_str(), 15); |
|
10837
|
|
|
|
|
|
|
|
|
10838
|
|
|
|
|
|
|
// Clear the tag |
|
10839
|
|
|
|
|
|
|
tag.clear(); |
|
10840
|
|
|
|
|
|
|
|
|
10841
|
|
|
|
|
|
|
// Fill FEAT of filled tag characters |
|
10842
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
|
|
|
0
|
|
|
|
|
|
|
10843
|
0
|
0
|
|
|
|
|
if (pdt_tag[i] != '-') { |
|
10844
|
0
|
0
|
|
|
|
|
if (!tag.empty()) tag.push_back('|'); |
|
10845
|
0
|
|
|
|
|
|
tag.append(names[i]); |
|
10846
|
0
|
|
|
|
|
|
tag.push_back('='); |
|
10847
|
0
|
|
|
|
|
|
tag.push_back(pdt_tag[i]); |
|
10848
|
|
|
|
|
|
|
} |
|
10849
|
|
|
|
|
|
|
|
|
10850
|
|
|
|
|
|
|
// Try adding Sem FEAT |
|
10851
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i + 2 < lemma.size(); i++) |
|
10852
|
0
|
0
|
|
|
|
|
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10853
|
0
|
0
|
|
|
|
|
if (!tag.empty()) tag.push_back('|'); |
|
10854
|
0
|
|
|
|
|
|
tag.append("Sem="); |
|
10855
|
0
|
|
|
|
|
|
tag.push_back(lemma[i + 2]); |
|
10856
|
|
|
|
|
|
|
break; |
|
10857
|
|
|
|
|
|
|
} |
|
10858
|
0
|
|
|
|
|
|
} |
|
10859
|
|
|
|
|
|
|
|
|
10860
|
0
|
|
|
|
|
|
// Truncates `lemma` to the length reported by
// czech_lemma_addinfo::raw_lemma_len. Returns true only when the lemma was
// actually shortened.
inline bool pdt_to_conll2009_tagset_converter::convert_lemma(string& lemma) const {
  const unsigned keep = czech_lemma_addinfo::raw_lemma_len(lemma);
  if (!(keep < lemma.size())) return false;

  lemma.resize(keep);
  return true;
}
|
10864
|
|
|
|
|
|
|
|
|
10865
|
0
|
|
|
|
|
|
// Converts one analysis in place. The tag is converted first because
// convert_tag reads the lemma, which convert_lemma then truncates.
void pdt_to_conll2009_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
  string& lemma = tagged_lemma.lemma;

  convert_tag(lemma, tagged_lemma.tag);
  convert_lemma(lemma);
}
|
10869
|
|
|
|
|
|
|
|
|
10870
|
0
|
|
|
|
|
|
void pdt_to_conll2009_tagset_converter::convert_analyzed(vector& tagged_lemmas) const { |
|
10871
|
|
|
|
|
|
|
bool lemma_changed = false; |
|
10872
|
|
|
|
|
|
|
|
|
10873
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma : tagged_lemmas) { |
|
10874
|
0
|
|
|
|
|
|
convert_tag(tagged_lemma.lemma, tagged_lemma.tag); |
|
10875
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma.lemma); |
|
10876
|
|
|
|
|
|
|
} |
|
10877
|
|
|
|
|
|
|
|
|
10878
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
|
10879
|
0
|
0
|
|
|
|
|
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10880
|
|
|
|
|
|
|
|
|
10881
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(tagged_lemmas); |
|
10882
|
|
|
|
|
|
|
} |
|
10883
|
|
|
|
|
|
|
|
|
10884
|
0
|
|
|
|
|
|
void pdt_to_conll2009_tagset_converter::convert_generated(vector& forms) const { |
|
10885
|
|
|
|
|
|
|
bool lemma_changed = false; |
|
10886
|
|
|
|
|
|
|
|
|
10887
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma_forms : forms) { |
|
10888
|
0
|
0
|
|
|
|
|
for (auto&& tagged_form : tagged_lemma_forms.forms) |
|
10889
|
0
|
|
|
|
|
|
convert_tag(tagged_lemma_forms.lemma, tagged_form.tag); |
|
10890
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma_forms.lemma); |
|
10891
|
|
|
|
|
|
|
} |
|
10892
|
|
|
|
|
|
|
|
|
10893
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
|
10894
|
0
|
0
|
|
|
|
|
if (!lemma_changed || forms.size() < 2) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10895
|
|
|
|
|
|
|
|
|
10896
|
0
|
|
|
|
|
|
tagset_converter_unique_generated(forms); |
|
10897
|
|
|
|
|
|
|
} |
|
10898
|
|
|
|
|
|
|
|
|
10899
|
|
|
|
|
|
|
} // namespace morphodita |
|
10900
|
|
|
|
|
|
|
|
|
10901
|
|
|
|
|
|
|
///////// |
|
10902
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.h |
|
10903
|
|
|
|
|
|
|
///////// |
|
10904
|
|
|
|
|
|
|
|
|
10905
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10906
|
|
|
|
|
|
|
// |
|
10907
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10908
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10909
|
|
|
|
|
|
|
// |
|
10910
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10911
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10912
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10913
|
|
|
|
|
|
|
|
|
10914
|
|
|
|
|
|
|
namespace morphodita { |
|
10915
|
|
|
|
|
|
|
|
|
10916
|
0
|
|
|
|
|
|
class strip_lemma_comment_tagset_converter : public tagset_converter { |
|
10917
|
|
|
|
|
|
|
public: |
|
10918
|
0
|
|
|
|
|
|
strip_lemma_comment_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {} |
|
10919
|
|
|
|
|
|
|
|
|
10920
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
|
10921
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
|
10922
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
|
10923
|
|
|
|
|
|
|
|
|
10924
|
|
|
|
|
|
|
private: |
|
10925
|
|
|
|
|
|
|
inline bool convert_lemma(string& lemma) const; |
|
10926
|
|
|
|
|
|
|
const morpho& dictionary; |
|
10927
|
|
|
|
|
|
|
}; |
|
10928
|
|
|
|
|
|
|
|
|
10929
|
|
|
|
|
|
|
} // namespace morphodita |
|
10930
|
|
|
|
|
|
|
|
|
10931
|
|
|
|
|
|
|
///////// |
|
10932
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_comment_tagset_converter.cpp |
|
10933
|
|
|
|
|
|
|
///////// |
|
10934
|
|
|
|
|
|
|
|
|
10935
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10936
|
|
|
|
|
|
|
// |
|
10937
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10938
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10939
|
|
|
|
|
|
|
// |
|
10940
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10941
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10942
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10943
|
|
|
|
|
|
|
|
|
10944
|
|
|
|
|
|
|
namespace morphodita { |
|
10945
|
|
|
|
|
|
|
|
|
10946
|
0
|
|
|
|
|
|
// Truncates `lemma` to dictionary.lemma_id_len(lemma). Returns true only when
// the lemma was actually shortened.
inline bool strip_lemma_comment_tagset_converter::convert_lemma(string& lemma) const {
  const unsigned keep = dictionary.lemma_id_len(lemma);
  if (!(keep < lemma.size())) return false;

  lemma.resize(keep);
  return true;
}
|
10950
|
|
|
|
|
|
|
|
|
10951
|
0
|
|
|
|
|
|
// Truncates the lemma of one analysis in place; the tag is not modified.
void strip_lemma_comment_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
  string& lemma = tagged_lemma.lemma;
  convert_lemma(lemma);
}
|
10954
|
|
|
|
|
|
|
|
|
10955
|
0
|
|
|
|
|
|
void strip_lemma_comment_tagset_converter::convert_analyzed(vector& tagged_lemmas) const { |
|
10956
|
|
|
|
|
|
|
bool lemma_changed = false; |
|
10957
|
|
|
|
|
|
|
|
|
10958
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma : tagged_lemmas) |
|
10959
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma.lemma); |
|
10960
|
|
|
|
|
|
|
|
|
10961
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
|
10962
|
0
|
0
|
|
|
|
|
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10963
|
|
|
|
|
|
|
|
|
10964
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(tagged_lemmas); |
|
10965
|
|
|
|
|
|
|
} |
|
10966
|
|
|
|
|
|
|
|
|
10967
|
0
|
|
|
|
|
|
void strip_lemma_comment_tagset_converter::convert_generated(vector& forms) const { |
|
10968
|
|
|
|
|
|
|
bool lemma_changed = false; |
|
10969
|
|
|
|
|
|
|
|
|
10970
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma_forms : forms) |
|
10971
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma_forms.lemma); |
|
10972
|
|
|
|
|
|
|
|
|
10973
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
|
10974
|
0
|
0
|
|
|
|
|
if (!lemma_changed || forms.size() < 2) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
10975
|
|
|
|
|
|
|
|
|
10976
|
0
|
|
|
|
|
|
tagset_converter_unique_generated(forms); |
|
10977
|
|
|
|
|
|
|
} |
|
10978
|
|
|
|
|
|
|
|
|
10979
|
|
|
|
|
|
|
} // namespace morphodita |
|
10980
|
|
|
|
|
|
|
|
|
10981
|
|
|
|
|
|
|
///////// |
|
10982
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.h |
|
10983
|
|
|
|
|
|
|
///////// |
|
10984
|
|
|
|
|
|
|
|
|
10985
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
10986
|
|
|
|
|
|
|
// |
|
10987
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
10988
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
10989
|
|
|
|
|
|
|
// |
|
10990
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
10991
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
10992
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
10993
|
|
|
|
|
|
|
|
|
10994
|
|
|
|
|
|
|
namespace morphodita { |
|
10995
|
|
|
|
|
|
|
|
|
10996
|
0
|
|
|
|
|
|
class strip_lemma_id_tagset_converter : public tagset_converter { |
|
10997
|
|
|
|
|
|
|
public: |
|
10998
|
0
|
|
|
|
|
|
strip_lemma_id_tagset_converter(const morpho& dictionary) : dictionary(dictionary) {} |
|
10999
|
|
|
|
|
|
|
|
|
11000
|
|
|
|
|
|
|
virtual void convert(tagged_lemma& tagged_lemma) const override; |
|
11001
|
|
|
|
|
|
|
virtual void convert_analyzed(vector& tagged_lemmas) const override; |
|
11002
|
|
|
|
|
|
|
virtual void convert_generated(vector& forms) const override; |
|
11003
|
|
|
|
|
|
|
|
|
11004
|
|
|
|
|
|
|
private: |
|
11005
|
|
|
|
|
|
|
inline bool convert_lemma(string& lemma) const; |
|
11006
|
|
|
|
|
|
|
const morpho& dictionary; |
|
11007
|
|
|
|
|
|
|
}; |
|
11008
|
|
|
|
|
|
|
|
|
11009
|
|
|
|
|
|
|
} // namespace morphodita |
|
11010
|
|
|
|
|
|
|
|
|
11011
|
|
|
|
|
|
|
///////// |
|
11012
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/strip_lemma_id_tagset_converter.cpp |
|
11013
|
|
|
|
|
|
|
///////// |
|
11014
|
|
|
|
|
|
|
|
|
11015
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11016
|
|
|
|
|
|
|
// |
|
11017
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
11018
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11019
|
|
|
|
|
|
|
// |
|
11020
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11021
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11022
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11023
|
|
|
|
|
|
|
|
|
11024
|
|
|
|
|
|
|
namespace morphodita { |
|
11025
|
|
|
|
|
|
|
|
|
11026
|
0
|
|
|
|
|
|
// Truncates `lemma` to dictionary.raw_lemma_len(lemma). Returns true only
// when the lemma was actually shortened.
inline bool strip_lemma_id_tagset_converter::convert_lemma(string& lemma) const {
  const unsigned keep = dictionary.raw_lemma_len(lemma);
  if (!(keep < lemma.size())) return false;

  lemma.resize(keep);
  return true;
}
|
11030
|
|
|
|
|
|
|
|
|
11031
|
0
|
|
|
|
|
|
// Truncates the lemma of one analysis in place; the tag is not modified.
void strip_lemma_id_tagset_converter::convert(tagged_lemma& tagged_lemma) const {
  string& lemma = tagged_lemma.lemma;
  convert_lemma(lemma);
}
|
11034
|
|
|
|
|
|
|
|
|
11035
|
0
|
|
|
|
|
|
void strip_lemma_id_tagset_converter::convert_analyzed(vector& tagged_lemmas) const { |
|
11036
|
|
|
|
|
|
|
bool lemma_changed = false; |
|
11037
|
|
|
|
|
|
|
|
|
11038
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma : tagged_lemmas) |
|
11039
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma.lemma); |
|
11040
|
|
|
|
|
|
|
|
|
11041
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
|
11042
|
0
|
0
|
|
|
|
|
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11043
|
|
|
|
|
|
|
|
|
11044
|
0
|
|
|
|
|
|
tagset_converter_unique_analyzed(tagged_lemmas); |
|
11045
|
|
|
|
|
|
|
} |
|
11046
|
|
|
|
|
|
|
|
|
11047
|
0
|
|
|
|
|
|
void strip_lemma_id_tagset_converter::convert_generated(vector& forms) const { |
|
11048
|
|
|
|
|
|
|
bool lemma_changed = false; |
|
11049
|
|
|
|
|
|
|
|
|
11050
|
0
|
0
|
|
|
|
|
for (auto&& tagged_lemma_forms : forms) |
|
11051
|
0
|
|
|
|
|
|
lemma_changed |= convert_lemma(tagged_lemma_forms.lemma); |
|
11052
|
|
|
|
|
|
|
|
|
11053
|
|
|
|
|
|
|
// If no lemma was changed or there is 1 analysis, no duplicates could be created. |
|
11054
|
0
|
0
|
|
|
|
|
if (!lemma_changed || forms.size() < 2) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11055
|
|
|
|
|
|
|
|
|
11056
|
0
|
|
|
|
|
|
tagset_converter_unique_generated(forms); |
|
11057
|
|
|
|
|
|
|
} |
|
11058
|
|
|
|
|
|
|
|
|
11059
|
|
|
|
|
|
|
} // namespace morphodita |
|
11060
|
|
|
|
|
|
|
|
|
11061
|
|
|
|
|
|
|
///////// |
|
11062
|
|
|
|
|
|
|
// File: morphodita/tagset_converter/tagset_converter.cpp |
|
11063
|
|
|
|
|
|
|
///////// |
|
11064
|
|
|
|
|
|
|
|
|
11065
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11066
|
|
|
|
|
|
|
// |
|
11067
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
11068
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11069
|
|
|
|
|
|
|
// |
|
11070
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11071
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11072
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11073
|
|
|
|
|
|
|
|
|
11074
|
|
|
|
|
|
|
namespace morphodita { |
|
11075
|
|
|
|
|
|
|
|
|
11076
|
0
|
|
|
|
|
|
// Factory: allocates an identity_tagset_converter with `new` and returns the
// raw pointer, so the caller is responsible for deleting it.
tagset_converter* tagset_converter::new_identity_converter() {
  tagset_converter* converter = new identity_tagset_converter();
  return converter;
}
|
11079
|
|
|
|
|
|
|
|
|
11080
|
0
|
|
|
|
|
|
// Factory: allocates a pdt_to_conll2009_tagset_converter with `new` and
// returns the raw pointer, so the caller is responsible for deleting it.
tagset_converter* tagset_converter::new_pdt_to_conll2009_converter() {
  tagset_converter* converter = new pdt_to_conll2009_tagset_converter();
  return converter;
}
|
11083
|
|
|
|
|
|
|
|
|
11084
|
0
|
|
|
|
|
|
// Factory: allocates a strip_lemma_comment_tagset_converter bound to
// `dictionary` and returns the raw pointer, so the caller is responsible for
// deleting it. `dictionary` is passed on by reference.
tagset_converter* tagset_converter::new_strip_lemma_comment_converter(const morpho& dictionary) {
  tagset_converter* converter = new strip_lemma_comment_tagset_converter(dictionary);
  return converter;
}
|
11087
|
|
|
|
|
|
|
|
|
11088
|
0
|
|
|
|
|
|
// Factory: allocates a strip_lemma_id_tagset_converter bound to `dictionary`
// and returns the raw pointer, so the caller is responsible for deleting it.
// `dictionary` is passed on by reference.
tagset_converter* tagset_converter::new_strip_lemma_id_converter(const morpho& dictionary) {
  tagset_converter* converter = new strip_lemma_id_tagset_converter(dictionary);
  return converter;
}
|
11091
|
|
|
|
|
|
|
|
|
11092
|
0
|
|
|
|
|
|
// Creates a tagset converter selected by `name`:
//   "pdt_to_conll2009", "strip_lemma_comment" or "strip_lemma_id".
// `dictionary` is forwarded to the two strip_* converters. Returns nullptr
// for any other name; otherwise the result comes from the matching factory.
tagset_converter* new_tagset_converter(const string& name, const morpho& dictionary) {
  if (name == "pdt_to_conll2009") {
    return tagset_converter::new_pdt_to_conll2009_converter();
  } else if (name == "strip_lemma_comment") {
    return tagset_converter::new_strip_lemma_comment_converter(dictionary);
  } else if (name == "strip_lemma_id") {
    return tagset_converter::new_strip_lemma_id_converter(dictionary);
  }

  return nullptr;
}
|
11098
|
|
|
|
|
|
|
|
|
11099
|
0
|
|
|
|
|
|
void tagset_converter_unique_analyzed(vector& tagged_lemmas) { |
|
11100
|
|
|
|
|
|
|
// Remove possible lemma-tag pair duplicates |
|
11101
|
|
|
|
|
|
|
struct tagged_lemma_comparator { |
|
11102
|
0
|
0
|
|
|
|
|
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
|
|
|
0
|
|
|
|
|
|
|
11103
|
0
|
0
|
|
|
|
|
inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); } |
|
11104
|
|
|
|
|
|
|
}; |
|
11105
|
|
|
|
|
|
|
|
|
11106
|
|
|
|
|
|
|
sort(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::lt); |
|
11107
|
0
|
|
|
|
|
|
tagged_lemmas.resize(unique(tagged_lemmas.begin(), tagged_lemmas.end(), tagged_lemma_comparator::eq) - tagged_lemmas.begin()); |
|
11108
|
0
|
|
|
|
|
|
} |
|
11109
|
|
|
|
|
|
|
|
|
11110
|
0
|
|
|
|
|
|
void tagset_converter_unique_generated(vector& forms) { |
|
11111
|
|
|
|
|
|
|
// Regroup and if needed remove duplicate form-tag pairs for each lemma |
|
11112
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < forms.size(); i++) { |
|
11113
|
|
|
|
|
|
|
bool any_merged = false; |
|
11114
|
0
|
0
|
|
|
|
|
for (unsigned j = forms.size() - 1; j > i; j--) |
|
11115
|
0
|
0
|
|
|
|
|
if (forms[j].lemma == forms[i].lemma) { |
|
11116
|
|
|
|
|
|
|
// Same lemma was found. Merge form-tag pairs |
|
11117
|
0
|
0
|
|
|
|
|
for (auto&& tagged_form : forms[j].forms) |
|
11118
|
0
|
|
|
|
|
|
forms[i].forms.emplace_back(move(tagged_form)); |
|
11119
|
|
|
|
|
|
|
|
|
11120
|
|
|
|
|
|
|
// Remove lemma j by moving it to end and deleting |
|
11121
|
0
|
0
|
|
|
|
|
if (j < forms.size() - 1) { |
|
11122
|
0
|
|
|
|
|
|
forms[j].lemma.swap(forms[forms.size() - 1].lemma); |
|
11123
|
0
|
|
|
|
|
|
forms[j].forms.swap(forms[forms.size() - 1].forms); |
|
11124
|
|
|
|
|
|
|
} |
|
11125
|
|
|
|
|
|
|
forms.pop_back(); |
|
11126
|
|
|
|
|
|
|
any_merged = true; |
|
11127
|
|
|
|
|
|
|
} |
|
11128
|
|
|
|
|
|
|
|
|
11129
|
0
|
0
|
|
|
|
|
if (any_merged && forms[i].forms.size() > 1) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11130
|
|
|
|
|
|
|
// Remove duplicate form-tag pairs |
|
11131
|
|
|
|
|
|
|
struct tagged_form_comparator { |
|
11132
|
0
|
0
|
|
|
|
|
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
|
|
|
0
|
|
|
|
|
|
|
11133
|
0
|
0
|
|
|
|
|
inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 || (tag_compare == 0 && a.form < b.form); } |
|
11134
|
|
|
|
|
|
|
}; |
|
11135
|
|
|
|
|
|
|
|
|
11136
|
|
|
|
|
|
|
sort(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::lt); |
|
11137
|
0
|
|
|
|
|
|
forms[i].forms.resize(unique(forms[i].forms.begin(), forms[i].forms.end(), tagged_form_comparator::eq) - forms[i].forms.begin()); |
|
11138
|
|
|
|
|
|
|
} |
|
11139
|
|
|
|
|
|
|
} |
|
11140
|
0
|
|
|
|
|
|
} |
|
11141
|
|
|
|
|
|
|
|
|
11142
|
|
|
|
|
|
|
} // namespace morphodita |
|
11143
|
|
|
|
|
|
|
|
|
11144
|
|
|
|
|
|
|
///////// |
|
11145
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer.cpp |
|
11146
|
|
|
|
|
|
|
///////// |
|
11147
|
|
|
|
|
|
|
|
|
11148
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11149
|
|
|
|
|
|
|
// |
|
11150
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
11151
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11152
|
|
|
|
|
|
|
// |
|
11153
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11154
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11155
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11156
|
|
|
|
|
|
|
|
|
11157
|
|
|
|
|
|
|
namespace morphodita { |
|
11158
|
|
|
|
|
|
|
|
|
11159
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). Indexed by the current state; the value, times two,
// is the offset into _czech_tokenizer_cond_keys of that state's condition keys.
static const char _czech_tokenizer_cond_offsets[] = {
  0, 0, 0, 0, 0, 0, 0, 0,
  2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2
};
|
11164
|
|
|
|
|
|
|
|
|
11165
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). Indexed by the current state; number of condition
// key ranges to search for that state (only state 7 has any).
static const char _czech_tokenizer_cond_lengths[] = {
  0, 0, 0, 0, 0, 0, 0, 2,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};
|
11170
|
|
|
|
|
|
|
|
|
11171
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). Condition keys stored as (low, high) pairs,
// followed by a trailing 0.
static const short _czech_tokenizer_cond_keys[] = {
  43u, 43u,
  45u, 45u,
  0
};
|
11174
|
|
|
|
|
|
|
|
|
11175
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). Presumably one condition-space id per key pair in
// _czech_tokenizer_cond_keys, plus a trailing 0 -- confirm against the
// generated scanner code.
static const char _czech_tokenizer_cond_spaces[] = {
  1, 0,
  0
};
|
11178
|
|
|
|
|
|
|
|
|
11179
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 23 entries; presumably the per-state start offset
// into _czech_tokenizer_trans_keys -- confirm against the generated scanner code.
static const unsigned char _czech_tokenizer_key_offsets[] = {
  0, 0, 17, 29, 43, 46, 51, 54,
  89, 94, 98, 101, 105, 110, 111, 116,
  117, 122, 136, 143, 148, 151, 163
};
|
11184
|
|
|
|
|
|
|
|
|
11185
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 167 transition keys including the trailing 0;
// presumably sliced per state by _czech_tokenizer_key_offsets -- confirm
// against the generated scanner code.
static const short _czech_tokenizer_trans_keys[] = {
  13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u,
  133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u,
  90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u,
  135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u,
  39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u,
  161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u,
  159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u,
  13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u,
  131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u,
  557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u,
  64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u,
  255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u,
  32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u,
  147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u,
  10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u,
  32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u,
  93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u,
  44u, 46u, 69u, 101u, 159u, 48u, 57u, 69u,
  101u, 159u, 48u, 57u, 159u, 48u, 57u, 129u,
  131u, 135u, 151u, 155u, 157u, 65u, 90u, 97u,
  122u, 142u, 143u, 159u, 48u, 57u, 0
};
|
11208
|
|
|
|
|
|
|
|
|
11209
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 23 entries; presumably the number of single-value
// transition keys of each state -- confirm against the generated scanner code.
static const char _czech_tokenizer_single_lengths[] = {
  0, 13, 10, 12, 1, 3, 1, 21,
  5, 4, 3, 4, 5, 1, 5, 1,
  5, 12, 5, 3, 1, 6, 1
};
|
11214
|
|
|
|
|
|
|
|
|
11215
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 23 entries; presumably the number of key ranges of
// each state -- confirm against the generated scanner code.
static const char _czech_tokenizer_range_lengths[] = {
  0, 2, 1, 1, 1, 1, 1, 7,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 1, 1, 1, 1, 3, 1
};
|
11220
|
|
|
|
|
|
|
|
|
11221
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 23 entries; presumably the per-state start offset
// into _czech_tokenizer_indicies -- confirm against the generated scanner code.
static const unsigned char _czech_tokenizer_index_offsets[] = {
  0, 0, 16, 28, 42, 45, 50, 53,
  82, 88, 93, 97, 102, 108, 110, 116,
  118, 124, 138, 145, 150, 153, 163
};
|
11226
|
|
|
|
|
|
|
|
|
11227
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 167 entries including the trailing 0; presumably
// each value selects an entry of _czech_tokenizer_trans_targs and
// _czech_tokenizer_trans_actions -- confirm against the generated scanner code.
static const char _czech_tokenizer_indicies[] = {
  1, 1, 2, 2, 2, 2, 2, 3,
  2, 3, 1, 2, 2, 1, 3, 0,
  2, 2, 2, 2, 2, 3, 2, 3,
  2, 2, 3, 0, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 4, 5,
  4, 0, 6, 6, 0, 7, 7, 8,
  8, 0, 8, 8, 0, 10, 11, 12,
  10, 13, 9, 13, 9, 13, 16, 16,
  16, 16, 10, 16, 15, 13, 9, 17,
  9, 17, 9, 15, 9, 16, 9, 16,
  9, 14, 10, 19, 20, 10, 10, 18,
  10, 21, 10, 10, 18, 10, 10, 10,
  18, 10, 21, 10, 10, 18, 10, 22,
  23, 10, 10, 18, 25, 24, 10, 22,
  26, 10, 10, 18, 25, 24, 10, 23,
  26, 10, 10, 18, 4, 4, 5, 5,
  5, 5, 5, 5, 5, 5, 4, 5,
  4, 27, 28, 28, 29, 29, 15, 15,
  27, 29, 29, 6, 6, 27, 8, 8,
  27, 16, 16, 16, 16, 16, 16, 16,
  16, 16, 27, 15, 15, 27, 0
};
|
11250
|
|
|
|
|
|
|
|
|
11251
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 30 entries; presumably the target state of each
// distinct transition -- confirm against the generated scanner code.
static const char _czech_tokenizer_trans_targs[] = {
  7, 1, 2, 7, 1, 3, 19, 6,
  20, 7, 8, 12, 16, 17, 0, 18,
  21, 22, 7, 9, 11, 10, 13, 14,
  7, 7, 15, 7, 4, 5
};
|
11257
|
|
|
|
|
|
|
|
|
11258
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 30 entries, parallel in length to
// _czech_tokenizer_trans_targs; presumably the action id run on each
// transition (0 = none) -- confirm against the generated scanner code.
static const char _czech_tokenizer_trans_actions[] = {
  1, 0, 0, 2, 3, 0, 4, 0,
  0, 7, 0, 0, 0, 4, 0, 4,
  0, 0, 8, 0, 0, 0, 0, 0,
  9, 10, 0, 11, 0, 0
};
|
11264
|
|
|
|
|
|
|
|
|
11265
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 23 entries; presumably the action id run when
// entering each state (only state 7 has one) -- confirm against the generated
// scanner code.
static const char _czech_tokenizer_to_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 5,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};
|
11270
|
|
|
|
|
|
|
|
|
11271
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). Indexed by the current state; the scanner switches
// on this value before consuming a character, and value 6 (state 7 only)
// records the token start position.
static const char _czech_tokenizer_from_state_actions[] = {
  0, 0, 0, 0, 0, 0, 0, 6,
  0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0
};
|
11276
|
|
|
|
|
|
|
|
|
11277
|
|
|
|
|
|
|
// Tokenizer state-machine table (appears to be Ragel-generated; regenerate
// rather than hand-edit). 23 entries; presumably the transition taken from
// each state at end of input (0 = none) -- confirm against the generated
// scanner code.
static const unsigned char _czech_tokenizer_eof_trans[] = {
  0, 1, 1, 1, 1, 1, 1, 0,
  19, 19, 19, 19, 19, 25, 19, 25,
  19, 28, 28, 28, 28, 28, 28
};
|
11282
|
|
|
|
|
|
|
|
|
11283
|
|
|
|
|
|
|
// Initial state of the tokenizer state machine; the scanner's current state
// is set to this value before scanning a sentence.
static const int czech_tokenizer_start = 7;
|
11284
|
|
|
|
|
|
|
|
|
11285
|
|
|
|
|
|
|
// The list of lower cased words that when preceding eos do not end sentence. |
|
11286
|
|
|
|
|
|
|
// Note: because of VS, we cannot list the abbreviations directly in UTF-8, |
|
11287
|
|
|
|
|
|
|
// because the compilation of utf-8 encoded sources fail on some locales |
|
11288
|
|
|
|
|
|
|
// (e.g., Japanese). |
|
11289
|
|
|
|
|
|
|
// perl -CS -ple 'use Encode;s/([^[:ascii:]])/join("", map {sprintf "\\%o", ord($_)} split(m@@, encode("utf-8", $1)))/ge' |
|
11290
|
|
|
|
|
|
|
// perl -CS -ple 'use Encode;s/\\([0-7]{3})\\([0-7]{3})/decode("utf-8", chr(oct($1)).chr(oct($2)))/ge' |
|
11291
|
218
|
100
|
|
|
|
|
// Lowercased Czech abbreviations, selected by the constructor for CZECH.
// Non-ASCII letters are written as octal-escaped UTF-8 bytes. A few entries
// ("p", "sv") are listed twice; the set collapses them.
// The set's element type is missing in this listing; it is restored as string
// to match the string-literal initializers.
const unordered_set<string> czech_tokenizer::abbreviations_czech = {
  // Titles
  "prof", "csc", "drsc", "doc", "phd", "ph", "d",
  "judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
  "ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
  "gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
  "p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
  // Geographic names
  "angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "n\304\233m", "nem", "it", "pol", "ma\304\217", "mad", "rus",
  "sev", "v\303\275ch", "vych", "ji\305\276", "jiz", "z\303\241p", "zap",
  // Common abbrevs
  "adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "nap\305\231", "napr",
  "okr", "pop\305\231", "popr", "pozn", "r", "\305\231", "red", "rep", "resp", "srov", "st", "st\305\231", "str",
  "sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
};
|
11306
|
|
|
|
|
|
|
|
|
11307
|
210
|
100
|
|
|
|
|
// Lowercased Slovak abbreviations, selected by the constructor for SLOVAK.
// Non-ASCII letters are written as octal-escaped UTF-8 bytes. A few entries
// ("p", "sv") are listed twice; the set collapses them.
// The set's element type is missing in this listing; it is restored as string
// to match the string-literal initializers.
const unordered_set<string> czech_tokenizer::abbreviations_slovak = {
  // Titles
  "prof", "csc", "drsc", "doc", "phd", "ph", "d",
  "judr", "mddr", "mudr", "mvdr", "paeddr", "paedr", "phdr", "rndr", "rsdr", "dr",
  "ing", "arch", "mgr", "bc", "mag", "mba", "bca", "mga",
  "gen", "plk", "pplk", "npor", "por", "ppor", "kpt", "mjr", "sgt", "pls", "p", "s",
  "p", "p\303\255", "fa", "fy", "mr", "mrs", "ms", "miss", "tr", "sv",
  // Geographic names
  "angl", "fr", "\304\215es", "ces", "\304\215s", "cs", "slov", "nem", "it", "po\304\276", "pol", "ma\304\217", "mad",
  "rus", "sev", "v\303\275ch", "vych", "ju\305\276", "juz", "z\303\241p", "zap",
  // Common abbrevs
  "adr", "\304\215", "c", "eg", "ev", "g", "hod", "j", "kr", "m", "max", "min", "mj", "napr",
  "okr", "popr", "pozn", "r", "red", "rep", "resp", "srov", "st", "str",
  "sv", "tel", "tj", "tzv", "\303\272", "u", "uh", "ul", "um", "zl", "zn",
};
|
11322
|
|
|
|
|
|
|
|
|
11323
|
0
|
|
|
|
|
|
// Constructs the tokenizer.
//   language: selects the abbreviation set (CZECH or SLOVAK).
//   version:  values <= 1 are passed to ragel_tokenizer as 1, anything else as 2.
//   m:        optional morphological dictionary; the pointer is stored as given
//             and may be null (merge_hyphenated then does nothing).
czech_tokenizer::czech_tokenizer(tokenizer_language language, unsigned version, const morpho* m)
  : ragel_tokenizer(version <= 1 ? 1 : 2), m(m) {
  // The switch has no default, so start from a defined value instead of
  // leaving the pointer uninitialized for an unexpected language value.
  // NOTE(review): confirm that consumers of `abbreviations` tolerate null.
  abbreviations = nullptr;

  switch (language) {
    case CZECH:
      abbreviations = &abbreviations_czech;
      break;
    case SLOVAK:
      abbreviations = &abbreviations_slovak;
      break;
  }
}
|
11334
|
|
|
|
|
|
|
|
|
11335
|
0
|
|
|
|
|
|
void czech_tokenizer::merge_hyphenated(vector& tokens) { |
|
11336
|
|
|
|
|
|
|
using namespace unilib; |
|
11337
|
|
|
|
|
|
|
|
|
11338
|
0
|
0
|
|
|
|
|
if (!m) return; |
|
11339
|
0
|
0
|
|
|
|
|
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11340
|
|
|
|
|
|
|
|
|
11341
|
|
|
|
|
|
|
unsigned matched_hyphens = 0; |
|
11342
|
0
|
0
|
|
|
|
|
for (unsigned hyphens = 1; hyphens <= 2; hyphens++) { |
|
11343
|
|
|
|
|
|
|
// Are the tokens a sequence of 'hyphens' hyphenated tokens? |
|
11344
|
0
|
0
|
|
|
|
|
if (tokens.size() < 2*hyphens + 1) break; |
|
11345
|
0
|
|
|
|
|
|
unsigned first_hyphen = tokens.size() - 2*hyphens; |
|
11346
|
0
|
0
|
|
|
|
|
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11347
|
0
|
0
|
|
|
|
|
tokens[first_hyphen].start + tokens[first_hyphen].length != tokens[first_hyphen + 1].start || |
|
11348
|
0
|
0
|
|
|
|
|
tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start || |
|
|
|
0
|
|
|
|
|
|
|
11349
|
0
|
|
|
|
|
|
chars[tokens[first_hyphen-1].start].cat & ~unicode::L) |
|
11350
|
|
|
|
|
|
|
break; |
|
11351
|
|
|
|
|
|
|
|
|
11352
|
0
|
0
|
|
|
|
|
if (m->analyze(string_piece(chars[tokens[first_hyphen-1].start].str, chars[tokens.back().start + tokens.back().length].str - chars[tokens[first_hyphen-1].start].str), morpho::NO_GUESSER, lemmas) >= 0) |
|
11353
|
|
|
|
|
|
|
matched_hyphens = hyphens; |
|
11354
|
|
|
|
|
|
|
} |
|
11355
|
|
|
|
|
|
|
|
|
11356
|
0
|
0
|
|
|
|
|
if (matched_hyphens) { |
|
11357
|
0
|
|
|
|
|
|
unsigned first = tokens.size() - 2*matched_hyphens - 1; |
|
11358
|
0
|
|
|
|
|
|
tokens[first].length = tokens.back().start + tokens.back().length - tokens[first].start; |
|
11359
|
0
|
|
|
|
|
|
tokens.resize(first + 1); |
|
11360
|
|
|
|
|
|
|
} |
|
11361
|
|
|
|
|
|
|
} |
|
11362
|
|
|
|
|
|
|
|
|
11363
|
0
|
|
|
|
|
|
bool czech_tokenizer::next_sentence(vector& tokens) { |
|
11364
|
|
|
|
|
|
|
using namespace unilib; |
|
11365
|
|
|
|
|
|
|
|
|
11366
|
|
|
|
|
|
|
int cs, act; |
|
11367
|
|
|
|
|
|
|
size_t ts, te; |
|
11368
|
|
|
|
|
|
|
size_t whitespace = 0; // Suppress "may be uninitialized" warning |
|
11369
|
|
|
|
|
|
|
|
|
11370
|
0
|
0
|
|
|
|
|
while (tokenize_url_email(tokens)) |
|
11371
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) |
|
11372
|
|
|
|
|
|
|
return true; |
|
11373
|
|
|
|
|
|
|
|
|
11374
|
|
|
|
|
|
|
{ |
|
11375
|
|
|
|
|
|
|
cs = czech_tokenizer_start; |
|
11376
|
0
|
|
|
|
|
|
ts = 0; |
|
11377
|
|
|
|
|
|
|
te = 0; |
|
11378
|
|
|
|
|
|
|
act = 0; |
|
11379
|
|
|
|
|
|
|
} |
|
11380
|
|
|
|
|
|
|
|
|
11381
|
|
|
|
|
|
|
{ |
|
11382
|
|
|
|
|
|
|
int _klen; |
|
11383
|
|
|
|
|
|
|
const short *_keys; |
|
11384
|
|
|
|
|
|
|
int _trans; |
|
11385
|
|
|
|
|
|
|
short _widec; |
|
11386
|
|
|
|
|
|
|
|
|
11387
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
|
11388
|
|
|
|
|
|
|
goto _test_eof; |
|
11389
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
11390
|
|
|
|
|
|
|
goto _out; |
|
11391
|
|
|
|
|
|
|
_resume: |
|
11392
|
0
|
0
|
|
|
|
|
switch ( _czech_tokenizer_from_state_actions[cs] ) { |
|
11393
|
|
|
|
|
|
|
case 6: |
|
11394
|
0
|
|
|
|
|
|
{ts = ( current);} |
|
11395
|
0
|
|
|
|
|
|
break; |
|
11396
|
|
|
|
|
|
|
} |
|
11397
|
|
|
|
|
|
|
|
|
11398
|
0
|
|
|
|
|
|
_widec = ( ragel_char(chars[current])); |
|
11399
|
0
|
|
|
|
|
|
_klen = _czech_tokenizer_cond_lengths[cs]; |
|
11400
|
0
|
|
|
|
|
|
_keys = _czech_tokenizer_cond_keys + (_czech_tokenizer_cond_offsets[cs]*2); |
|
11401
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
11402
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
11403
|
|
|
|
|
|
|
const short *_mid; |
|
11404
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
11405
|
|
|
|
|
|
|
while (1) { |
|
11406
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
11407
|
|
|
|
|
|
|
break; |
|
11408
|
|
|
|
|
|
|
|
|
11409
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
11410
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
11411
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
11412
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
11413
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
11414
|
|
|
|
|
|
|
else { |
|
11415
|
0
|
|
|
|
|
|
switch ( _czech_tokenizer_cond_spaces[_czech_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) { |
|
11416
|
|
|
|
|
|
|
case 0: { |
|
11417
|
0
|
|
|
|
|
|
_widec = (short)(256u + (( ragel_char(chars[current])) - 0u)); |
|
11418
|
0
|
0
|
|
|
|
|
if ( |
|
11419
|
0
|
0
|
|
|
|
|
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
|
0
|
|
|
|
|
|
|
11420
|
|
|
|
|
|
|
break; |
|
11421
|
|
|
|
|
|
|
} |
|
11422
|
|
|
|
|
|
|
case 1: { |
|
11423
|
0
|
|
|
|
|
|
_widec = (short)(768u + (( ragel_char(chars[current])) - 0u)); |
|
11424
|
0
|
0
|
|
|
|
|
if ( |
|
11425
|
0
|
0
|
|
|
|
|
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11426
|
|
|
|
|
|
|
break; |
|
11427
|
|
|
|
|
|
|
} |
|
11428
|
|
|
|
|
|
|
} |
|
11429
|
|
|
|
|
|
|
break; |
|
11430
|
|
|
|
|
|
|
} |
|
11431
|
|
|
|
|
|
|
} |
|
11432
|
|
|
|
|
|
|
} |
|
11433
|
|
|
|
|
|
|
|
|
11434
|
0
|
|
|
|
|
|
_keys = _czech_tokenizer_trans_keys + _czech_tokenizer_key_offsets[cs]; |
|
11435
|
0
|
|
|
|
|
|
_trans = _czech_tokenizer_index_offsets[cs]; |
|
11436
|
|
|
|
|
|
|
|
|
11437
|
0
|
|
|
|
|
|
_klen = _czech_tokenizer_single_lengths[cs]; |
|
11438
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
11439
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
11440
|
|
|
|
|
|
|
const short *_mid; |
|
11441
|
0
|
|
|
|
|
|
const short *_upper = _keys + _klen - 1; |
|
11442
|
|
|
|
|
|
|
while (1) { |
|
11443
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
11444
|
|
|
|
|
|
|
break; |
|
11445
|
|
|
|
|
|
|
|
|
11446
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
11447
|
0
|
0
|
|
|
|
|
if ( _widec < *_mid ) |
|
11448
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
11449
|
0
|
0
|
|
|
|
|
else if ( _widec > *_mid ) |
|
11450
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
11451
|
|
|
|
|
|
|
else { |
|
11452
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
11453
|
0
|
|
|
|
|
|
goto _match; |
|
11454
|
|
|
|
|
|
|
} |
|
11455
|
|
|
|
|
|
|
} |
|
11456
|
0
|
|
|
|
|
|
_keys += _klen; |
|
11457
|
0
|
|
|
|
|
|
_trans += _klen; |
|
11458
|
|
|
|
|
|
|
} |
|
11459
|
|
|
|
|
|
|
|
|
11460
|
0
|
|
|
|
|
|
_klen = _czech_tokenizer_range_lengths[cs]; |
|
11461
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
11462
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
11463
|
|
|
|
|
|
|
const short *_mid; |
|
11464
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
11465
|
|
|
|
|
|
|
while (1) { |
|
11466
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
11467
|
|
|
|
|
|
|
break; |
|
11468
|
|
|
|
|
|
|
|
|
11469
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
11470
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
11471
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
11472
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
11473
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
11474
|
|
|
|
|
|
|
else { |
|
11475
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
11476
|
0
|
|
|
|
|
|
goto _match; |
|
11477
|
|
|
|
|
|
|
} |
|
11478
|
|
|
|
|
|
|
} |
|
11479
|
0
|
|
|
|
|
|
_trans += _klen; |
|
11480
|
|
|
|
|
|
|
} |
|
11481
|
|
|
|
|
|
|
|
|
11482
|
|
|
|
|
|
|
_match: |
|
11483
|
0
|
|
|
|
|
|
_trans = _czech_tokenizer_indicies[_trans]; |
|
11484
|
|
|
|
|
|
|
_eof_trans: |
|
11485
|
0
|
|
|
|
|
|
cs = _czech_tokenizer_trans_targs[_trans]; |
|
11486
|
|
|
|
|
|
|
|
|
11487
|
0
|
0
|
|
|
|
|
if ( _czech_tokenizer_trans_actions[_trans] == 0 ) |
|
11488
|
|
|
|
|
|
|
goto _again; |
|
11489
|
|
|
|
|
|
|
|
|
11490
|
0
|
|
|
|
|
|
switch ( _czech_tokenizer_trans_actions[_trans] ) { |
|
11491
|
|
|
|
|
|
|
case 3: |
|
11492
|
0
|
|
|
|
|
|
{ whitespace = current; } |
|
11493
|
0
|
|
|
|
|
|
break; |
|
11494
|
|
|
|
|
|
|
case 4: |
|
11495
|
0
|
|
|
|
|
|
{te = ( current)+1;} |
|
11496
|
0
|
|
|
|
|
|
break; |
|
11497
|
|
|
|
|
|
|
case 7: |
|
11498
|
0
|
|
|
|
|
|
{te = ( current)+1;{ tokens.emplace_back(ts, te - ts); |
|
11499
|
0
|
|
|
|
|
|
merge_hyphenated(tokens); |
|
11500
|
0
|
|
|
|
|
|
current = te; |
|
11501
|
0
|
0
|
|
|
|
|
do |
|
11502
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11503
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
11504
|
0
|
|
|
|
|
|
( current)--; |
|
11505
|
|
|
|
|
|
|
}} |
|
11506
|
0
|
|
|
|
|
|
break; |
|
11507
|
|
|
|
|
|
|
case 2: |
|
11508
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
|
11509
|
0
|
|
|
|
|
|
bool eos = is_eos(tokens, chars[ts].chr, abbreviations); |
|
11510
|
0
|
0
|
|
|
|
|
for (current = ts; current < whitespace; current++) |
|
11511
|
0
|
|
|
|
|
|
tokens.emplace_back(current, 1); |
|
11512
|
0
|
|
|
|
|
|
{( current) = (( whitespace))-1;} |
|
11513
|
0
|
0
|
|
|
|
|
if (eos) {( current)++; goto _out; } |
|
11514
|
|
|
|
|
|
|
}} |
|
11515
|
|
|
|
|
|
|
break; |
|
11516
|
|
|
|
|
|
|
case 10: |
|
11517
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
|
11518
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
|
11519
|
0
|
|
|
|
|
|
current = te; |
|
11520
|
0
|
0
|
|
|
|
|
do |
|
11521
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11522
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
11523
|
0
|
|
|
|
|
|
( current)--; |
|
11524
|
|
|
|
|
|
|
}} |
|
11525
|
0
|
|
|
|
|
|
break; |
|
11526
|
|
|
|
|
|
|
case 11: |
|
11527
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts); |
|
11528
|
0
|
|
|
|
|
|
merge_hyphenated(tokens); |
|
11529
|
0
|
|
|
|
|
|
current = te; |
|
11530
|
0
|
0
|
|
|
|
|
do |
|
11531
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11532
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
11533
|
0
|
|
|
|
|
|
( current)--; |
|
11534
|
|
|
|
|
|
|
}} |
|
11535
|
0
|
|
|
|
|
|
break; |
|
11536
|
|
|
|
|
|
|
case 8: |
|
11537
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
|
11538
|
0
|
|
|
|
|
|
current = te; |
|
11539
|
0
|
0
|
|
|
|
|
do |
|
11540
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11541
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
11542
|
0
|
|
|
|
|
|
( current)--; |
|
11543
|
|
|
|
|
|
|
}} |
|
11544
|
0
|
|
|
|
|
|
break; |
|
11545
|
|
|
|
|
|
|
case 9: |
|
11546
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
|
11547
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
|
11548
|
0
|
|
|
|
|
|
current = te; |
|
11549
|
0
|
0
|
|
|
|
|
do |
|
11550
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11551
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
11552
|
0
|
|
|
|
|
|
( current)--; |
|
11553
|
|
|
|
|
|
|
}} |
|
11554
|
0
|
|
|
|
|
|
break; |
|
11555
|
|
|
|
|
|
|
case 1: |
|
11556
|
0
|
|
|
|
|
|
{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts); |
|
11557
|
0
|
|
|
|
|
|
merge_hyphenated(tokens); |
|
11558
|
0
|
|
|
|
|
|
current = te; |
|
11559
|
0
|
0
|
|
|
|
|
do |
|
11560
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11561
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
11562
|
0
|
|
|
|
|
|
( current)--; |
|
11563
|
|
|
|
|
|
|
}} |
|
11564
|
0
|
|
|
|
|
|
break; |
|
11565
|
|
|
|
|
|
|
} |
|
11566
|
|
|
|
|
|
|
|
|
11567
|
|
|
|
|
|
|
_again: |
|
11568
|
0
|
0
|
|
|
|
|
switch ( _czech_tokenizer_to_state_actions[cs] ) { |
|
11569
|
|
|
|
|
|
|
case 5: |
|
11570
|
0
|
|
|
|
|
|
{ts = 0;} |
|
11571
|
0
|
|
|
|
|
|
break; |
|
11572
|
|
|
|
|
|
|
} |
|
11573
|
|
|
|
|
|
|
|
|
11574
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
11575
|
|
|
|
|
|
|
goto _out; |
|
11576
|
0
|
0
|
|
|
|
|
if ( ++( current) != ( (chars.size() - 1)) ) |
|
11577
|
|
|
|
|
|
|
goto _resume; |
|
11578
|
|
|
|
|
|
|
_test_eof: {} |
|
11579
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
|
11580
|
|
|
|
|
|
|
{ |
|
11581
|
0
|
0
|
|
|
|
|
if ( _czech_tokenizer_eof_trans[cs] > 0 ) { |
|
11582
|
0
|
|
|
|
|
|
_trans = _czech_tokenizer_eof_trans[cs] - 1; |
|
11583
|
0
|
|
|
|
|
|
goto _eof_trans; |
|
11584
|
|
|
|
|
|
|
} |
|
11585
|
|
|
|
|
|
|
} |
|
11586
|
|
|
|
|
|
|
|
|
11587
|
|
|
|
|
|
|
_out: {} |
|
11588
|
|
|
|
|
|
|
} |
|
11589
|
|
|
|
|
|
|
|
|
11590
|
|
|
|
|
|
|
(void)act; // Suppress unused variable warning |
|
11591
|
|
|
|
|
|
|
|
|
11592
|
0
|
|
|
|
|
|
return !tokens.empty(); |
|
11593
|
|
|
|
|
|
|
} |
|
11594
|
|
|
|
|
|
|
|
|
11595
|
|
|
|
|
|
|
} // namespace morphodita |
|
11596
|
|
|
|
|
|
|
|
|
11597
|
|
|
|
|
|
|
///////// |
|
11598
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory.h |
|
11599
|
|
|
|
|
|
|
///////// |
|
11600
|
|
|
|
|
|
|
|
|
11601
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11602
|
|
|
|
|
|
|
// |
|
11603
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
|
11604
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11605
|
|
|
|
|
|
|
// |
|
11606
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11607
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11608
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11609
|
|
|
|
|
|
|
|
|
11610
|
|
|
|
|
|
|
namespace morphodita { |
|
11611
|
|
|
|
|
|
|
|
|
11612
|
0
|
|
|
|
|
|
class czech_tokenizer_factory : public tokenizer_factory { |
|
11613
|
|
|
|
|
|
|
public: |
|
11614
|
|
|
|
|
|
|
// Construct a new tokenizer instance. |
|
11615
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer(const morpho* m) const override; |
|
11616
|
|
|
|
|
|
|
|
|
11617
|
|
|
|
|
|
|
bool load(istream& is); |
|
11618
|
|
|
|
|
|
|
private: |
|
11619
|
|
|
|
|
|
|
czech_tokenizer::tokenizer_language language; |
|
11620
|
|
|
|
|
|
|
unsigned version; |
|
11621
|
|
|
|
|
|
|
}; |
|
11622
|
|
|
|
|
|
|
|
|
11623
|
|
|
|
|
|
|
} // namespace morphodita |
|
11624
|
|
|
|
|
|
|
|
|
11625
|
|
|
|
|
|
|
///////// |
|
11626
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory.cpp |
|
11627
|
|
|
|
|
|
|
///////// |
|
11628
|
|
|
|
|
|
|
|
|
11629
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11630
|
|
|
|
|
|
|
// |
|
11631
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
|
11632
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11633
|
|
|
|
|
|
|
// |
|
11634
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11635
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11636
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11637
|
|
|
|
|
|
|
|
|
11638
|
|
|
|
|
|
|
namespace morphodita { |
|
11639
|
|
|
|
|
|
|
|
|
11640
|
0
|
|
|
|
|
|
tokenizer* czech_tokenizer_factory::new_tokenizer(const morpho* m) const { |
|
11641
|
0
|
0
|
|
|
|
|
return new czech_tokenizer(language, version, m); |
|
11642
|
|
|
|
|
|
|
} |
|
11643
|
|
|
|
|
|
|
|
|
11644
|
0
|
|
|
|
|
|
bool czech_tokenizer_factory::load(istream& is) { |
|
11645
|
0
|
|
|
|
|
|
language = czech_tokenizer::tokenizer_language(is.get()); |
|
11646
|
0
|
|
|
|
|
|
version = is.get(); |
|
11647
|
|
|
|
|
|
|
|
|
11648
|
0
|
0
|
|
|
|
|
return bool(is) && (language == czech_tokenizer::CZECH || language == czech_tokenizer::SLOVAK); |
|
|
|
0
|
|
|
|
|
|
|
11649
|
|
|
|
|
|
|
} |
|
11650
|
|
|
|
|
|
|
|
|
11651
|
|
|
|
|
|
|
} // namespace morphodita |
|
11652
|
|
|
|
|
|
|
|
|
11653
|
|
|
|
|
|
|
///////// |
|
11654
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory_encoder.h |
|
11655
|
|
|
|
|
|
|
///////// |
|
11656
|
|
|
|
|
|
|
|
|
11657
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11658
|
|
|
|
|
|
|
// |
|
11659
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
|
11660
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11661
|
|
|
|
|
|
|
// |
|
11662
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11663
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11664
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11665
|
|
|
|
|
|
|
|
|
11666
|
|
|
|
|
|
|
namespace morphodita { |
|
11667
|
|
|
|
|
|
|
|
|
11668
|
|
|
|
|
|
|
class czech_tokenizer_factory_encoder { |
|
11669
|
|
|
|
|
|
|
public: |
|
11670
|
|
|
|
|
|
|
static void encode(czech_tokenizer::tokenizer_language language, unsigned version, ostream& os); |
|
11671
|
|
|
|
|
|
|
}; |
|
11672
|
|
|
|
|
|
|
|
|
11673
|
|
|
|
|
|
|
} // namespace morphodita |
|
11674
|
|
|
|
|
|
|
|
|
11675
|
|
|
|
|
|
|
///////// |
|
11676
|
|
|
|
|
|
|
// File: morphodita/tokenizer/czech_tokenizer_factory_encoder.cpp |
|
11677
|
|
|
|
|
|
|
///////// |
|
11678
|
|
|
|
|
|
|
|
|
11679
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11680
|
|
|
|
|
|
|
// |
|
11681
|
|
|
|
|
|
|
// Copyright 2019 Institute of Formal and Applied Linguistics, Faculty of |
|
11682
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11683
|
|
|
|
|
|
|
// |
|
11684
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11685
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11686
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11687
|
|
|
|
|
|
|
|
|
11688
|
|
|
|
|
|
|
namespace morphodita { |
|
11689
|
|
|
|
|
|
|
|
|
11690
|
0
|
|
|
|
|
|
void czech_tokenizer_factory_encoder::encode(czech_tokenizer::tokenizer_language language, unsigned version, ostream& os) { |
|
11691
|
0
|
|
|
|
|
|
os.put(language); |
|
11692
|
0
|
|
|
|
|
|
os.put(version); |
|
11693
|
0
|
|
|
|
|
|
} |
|
11694
|
|
|
|
|
|
|
|
|
11695
|
|
|
|
|
|
|
} // namespace morphodita |
|
11696
|
|
|
|
|
|
|
|
|
11697
|
|
|
|
|
|
|
///////// |
|
11698
|
|
|
|
|
|
|
// File: morphodita/tokenizer/english_tokenizer.cpp |
|
11699
|
|
|
|
|
|
|
///////// |
|
11700
|
|
|
|
|
|
|
|
|
11701
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
11702
|
|
|
|
|
|
|
// |
|
11703
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
11704
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
11705
|
|
|
|
|
|
|
// |
|
11706
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
11707
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
11708
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
11709
|
|
|
|
|
|
|
|
|
11710
|
|
|
|
|
|
|
namespace morphodita { |
|
11711
|
|
|
|
|
|
|
|
|
11712
|
|
|
|
|
|
|
// The list of lowercased words that when preceding eos do not end sentence. |
|
11713
|
232
|
100
|
|
|
|
|
const unordered_set english_tokenizer::abbreviations = { |
|
|
|
0
|
|
|
|
|
|
|
11714
|
|
|
|
|
|
|
// Titles |
|
11715
|
|
|
|
|
|
|
"adj", "adm", "adv", "assoc", "asst", "bart", "bldg", "brig", "bros", "capt", |
|
11716
|
|
|
|
|
|
|
"cmdr", "col", "comdr", "con", "corp", "cpl", "d", "dr", "dr", "drs", "ens", |
|
11717
|
|
|
|
|
|
|
"gen", "gov", "hon", "hosp", "hr", "insp", "lt", "mm", "mr", "mrs", "ms", |
|
11718
|
|
|
|
|
|
|
"maj", "messrs", "mlle", "mme", "mr", "mrs", "ms", "msgr", "op", "ord", |
|
11719
|
|
|
|
|
|
|
"pfc", "ph", "phd", "prof", "pvt", "rep", "reps", "res", "rev", "rt", "sen", |
|
11720
|
|
|
|
|
|
|
"sens", "sfc", "sgt", "sr", "st", "supt", "surg", "univ", |
|
11721
|
|
|
|
|
|
|
// Common abbrevs |
|
11722
|
|
|
|
|
|
|
"addr", "approx", "apr", "aug", "calif", "co", "corp", "dec", "def", "e", |
|
11723
|
|
|
|
|
|
|
"e.g", "eg", "feb", "fla", "ft", "gen", "gov", "hrs", "i.", "i.e", "ie", |
|
11724
|
|
|
|
|
|
|
"inc", "jan", "jr", "ltd", "mar", "max", "min", "mph", "mt", "n", "nov", |
|
11725
|
|
|
|
|
|
|
"oct", "ont", "pa", "pres", "rep", "rev", "s", "sec", "sen", "sep", "sept", |
|
11726
|
|
|
|
|
|
|
"sgt", "sr", "tel", "un", "univ", "v", "va", "vs", "w", "yrs", |
|
11727
|
|
|
|
|
|
|
}; |
|
11728
|
|
|
|
|
|
|
|
|
11729
|
|
|
|
|
|
|
static const char _english_tokenizer_split_token_key_offsets[] = { |
|
11730
|
|
|
|
|
|
|
0, 0, 16, 20, 22, 26, 28, 30, |
|
11731
|
|
|
|
|
|
|
32, 34, 36, 44, 46, 50, 52, 54, |
|
11732
|
|
|
|
|
|
|
56, 58, 60, 62, 64, 66, 68, 72, |
|
11733
|
|
|
|
|
|
|
74, 76, 78, 80, 82, 82 |
|
11734
|
|
|
|
|
|
|
}; |
|
11735
|
|
|
|
|
|
|
|
|
11736
|
|
|
|
|
|
|
static const unsigned char _english_tokenizer_split_token_trans_keys[] = { |
|
11737
|
|
|
|
|
|
|
65u, 68u, 69u, 76u, 77u, 78u, 83u, 84u, |
|
11738
|
|
|
|
|
|
|
97u, 100u, 101u, 108u, 109u, 110u, 115u, 116u, |
|
11739
|
|
|
|
|
|
|
78u, 84u, 110u, 116u, 78u, 110u, 65u, 79u, |
|
11740
|
|
|
|
|
|
|
97u, 111u, 87u, 119u, 71u, 103u, 84u, 116u, |
|
11741
|
|
|
|
|
|
|
79u, 111u, 39u, 161u, 77u, 82u, 86u, 89u, |
|
11742
|
|
|
|
|
|
|
109u, 114u, 118u, 121u, 77u, 109u, 69u, 73u, |
|
11743
|
|
|
|
|
|
|
101u, 105u, 76u, 108u, 39u, 161u, 68u, 100u, |
|
11744
|
|
|
|
|
|
|
76u, 108u, 39u, 161u, 69u, 101u, 82u, 114u, |
|
11745
|
|
|
|
|
|
|
79u, 111u, 77u, 109u, 39u, 79u, 111u, 161u, |
|
11746
|
|
|
|
|
|
|
78u, 110u, 78u, 110u, 78u, 110u, 65u, 97u, |
|
11747
|
|
|
|
|
|
|
67u, 99u, 0 |
|
11748
|
|
|
|
|
|
|
}; |
|
11749
|
|
|
|
|
|
|
|
|
11750
|
|
|
|
|
|
|
static const char _english_tokenizer_split_token_single_lengths[] = { |
|
11751
|
|
|
|
|
|
|
0, 16, 4, 2, 4, 2, 2, 2, |
|
11752
|
|
|
|
|
|
|
2, 2, 8, 2, 4, 2, 2, 2, |
|
11753
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 2, 4, 2, |
|
11754
|
|
|
|
|
|
|
2, 2, 2, 2, 0, 0 |
|
11755
|
|
|
|
|
|
|
}; |
|
11756
|
|
|
|
|
|
|
|
|
11757
|
|
|
|
|
|
|
static const char _english_tokenizer_split_token_range_lengths[] = { |
|
11758
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11759
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11760
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11761
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0 |
|
11762
|
|
|
|
|
|
|
}; |
|
11763
|
|
|
|
|
|
|
|
|
11764
|
|
|
|
|
|
|
static const unsigned char _english_tokenizer_split_token_index_offsets[] = { |
|
11765
|
|
|
|
|
|
|
0, 0, 17, 22, 25, 30, 33, 36, |
|
11766
|
|
|
|
|
|
|
39, 42, 45, 54, 57, 62, 65, 68, |
|
11767
|
|
|
|
|
|
|
71, 74, 77, 80, 83, 86, 89, 94, |
|
11768
|
|
|
|
|
|
|
97, 100, 103, 106, 109, 110 |
|
11769
|
|
|
|
|
|
|
}; |
|
11770
|
|
|
|
|
|
|
|
|
11771
|
|
|
|
|
|
|
static const char _english_tokenizer_split_token_indicies[] = { |
|
11772
|
|
|
|
|
|
|
0, 2, 3, 4, 2, 5, 2, 6, |
|
11773
|
|
|
|
|
|
|
0, 2, 3, 4, 2, 5, 2, 6, |
|
11774
|
|
|
|
|
|
|
1, 7, 8, 7, 8, 1, 9, 9, |
|
11775
|
|
|
|
|
|
|
1, 10, 11, 10, 11, 1, 12, 12, |
|
11776
|
|
|
|
|
|
|
1, 12, 12, 1, 13, 13, 1, 11, |
|
11777
|
|
|
|
|
|
|
11, 1, 14, 14, 1, 15, 2, 2, |
|
11778
|
|
|
|
|
|
|
16, 15, 2, 2, 16, 1, 17, 17, |
|
11779
|
|
|
|
|
|
|
1, 18, 11, 18, 11, 1, 12, 12, |
|
11780
|
|
|
|
|
|
|
1, 19, 19, 1, 12, 12, 1, 2, |
|
11781
|
|
|
|
|
|
|
2, 1, 20, 20, 1, 21, 21, 1, |
|
11782
|
|
|
|
|
|
|
22, 22, 1, 23, 23, 1, 12, 12, |
|
11783
|
|
|
|
|
|
|
1, 24, 25, 25, 24, 1, 14, 14, |
|
11784
|
|
|
|
|
|
|
1, 26, 26, 1, 27, 27, 1, 28, |
|
11785
|
|
|
|
|
|
|
28, 1, 12, 12, 1, 1, 1, 0 |
|
11786
|
|
|
|
|
|
|
}; |
|
11787
|
|
|
|
|
|
|
|
|
11788
|
|
|
|
|
|
|
static const char _english_tokenizer_split_token_trans_targs[] = { |
|
11789
|
|
|
|
|
|
|
2, 0, 9, 10, 16, 17, 22, 3, |
|
11790
|
|
|
|
|
|
|
7, 4, 5, 6, 28, 8, 29, 11, |
|
11791
|
|
|
|
|
|
|
14, 12, 13, 15, 18, 19, 20, 21, |
|
11792
|
|
|
|
|
|
|
23, 24, 25, 26, 27 |
|
11793
|
|
|
|
|
|
|
}; |
|
11794
|
|
|
|
|
|
|
|
|
11795
|
|
|
|
|
|
|
static const char _english_tokenizer_split_token_trans_actions[] = { |
|
11796
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 1, |
|
11797
|
|
|
|
|
|
|
1, 0, 0, 0, 0, 0, 2, 1, |
|
11798
|
|
|
|
|
|
|
1, 0, 0, 0, 1, 0, 0, 0, |
|
11799
|
|
|
|
|
|
|
0, 0, 1, 0, 0 |
|
11800
|
|
|
|
|
|
|
}; |
|
11801
|
|
|
|
|
|
|
|
|
11802
|
|
|
|
|
|
|
static const char _english_tokenizer_split_token_eof_actions[] = { |
|
11803
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11804
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11805
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11806
|
|
|
|
|
|
|
0, 0, 0, 0, 3, 0 |
|
11807
|
|
|
|
|
|
|
}; |
|
11808
|
|
|
|
|
|
|
|
|
11809
|
|
|
|
|
|
|
static const int english_tokenizer_split_token_start = 1; |
|
11810
|
|
|
|
|
|
|
|
|
11811
|
0
|
|
|
|
|
|
void english_tokenizer::split_token(vector& tokens) { |
|
11812
|
0
|
0
|
|
|
|
|
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
11813
|
|
|
|
|
|
|
|
|
11814
|
0
|
|
|
|
|
|
size_t index = tokens.back().start, end = index + tokens.back().length; |
|
11815
|
|
|
|
|
|
|
int cs; |
|
11816
|
0
|
|
|
|
|
|
size_t split_mark = 0, split_len = 0; |
|
11817
|
|
|
|
|
|
|
|
|
11818
|
|
|
|
|
|
|
{ |
|
11819
|
|
|
|
|
|
|
cs = english_tokenizer_split_token_start; |
|
11820
|
|
|
|
|
|
|
} |
|
11821
|
|
|
|
|
|
|
|
|
11822
|
|
|
|
|
|
|
{ |
|
11823
|
|
|
|
|
|
|
int _klen; |
|
11824
|
|
|
|
|
|
|
const unsigned char *_keys; |
|
11825
|
|
|
|
|
|
|
int _trans; |
|
11826
|
|
|
|
|
|
|
|
|
11827
|
0
|
0
|
|
|
|
|
if ( ( index) == ( end) ) |
|
11828
|
|
|
|
|
|
|
goto _test_eof; |
|
11829
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
11830
|
|
|
|
|
|
|
goto _out; |
|
11831
|
|
|
|
|
|
|
_resume: |
|
11832
|
0
|
|
|
|
|
|
_keys = _english_tokenizer_split_token_trans_keys + _english_tokenizer_split_token_key_offsets[cs]; |
|
11833
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_split_token_index_offsets[cs]; |
|
11834
|
|
|
|
|
|
|
|
|
11835
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_split_token_single_lengths[cs]; |
|
11836
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
11837
|
|
|
|
|
|
|
const unsigned char *_lower = _keys; |
|
11838
|
|
|
|
|
|
|
const unsigned char *_mid; |
|
11839
|
0
|
|
|
|
|
|
const unsigned char *_upper = _keys + _klen - 1; |
|
11840
|
|
|
|
|
|
|
while (1) { |
|
11841
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
11842
|
|
|
|
|
|
|
break; |
|
11843
|
|
|
|
|
|
|
|
|
11844
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
11845
|
0
|
0
|
|
|
|
|
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid ) |
|
11846
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
11847
|
0
|
0
|
|
|
|
|
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid ) |
|
11848
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
11849
|
|
|
|
|
|
|
else { |
|
11850
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
11851
|
0
|
|
|
|
|
|
goto _match; |
|
11852
|
|
|
|
|
|
|
} |
|
11853
|
|
|
|
|
|
|
} |
|
11854
|
0
|
|
|
|
|
|
_keys += _klen; |
|
11855
|
0
|
|
|
|
|
|
_trans += _klen; |
|
11856
|
|
|
|
|
|
|
} |
|
11857
|
|
|
|
|
|
|
|
|
11858
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_split_token_range_lengths[cs]; |
|
11859
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
11860
|
|
|
|
|
|
|
const unsigned char *_lower = _keys; |
|
11861
|
|
|
|
|
|
|
const unsigned char *_mid; |
|
11862
|
0
|
|
|
|
|
|
const unsigned char *_upper = _keys + (_klen<<1) - 2; |
|
11863
|
|
|
|
|
|
|
while (1) { |
|
11864
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
11865
|
|
|
|
|
|
|
break; |
|
11866
|
|
|
|
|
|
|
|
|
11867
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
11868
|
0
|
0
|
|
|
|
|
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] ) |
|
11869
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
11870
|
0
|
0
|
|
|
|
|
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] ) |
|
11871
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
11872
|
|
|
|
|
|
|
else { |
|
11873
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
11874
|
0
|
|
|
|
|
|
goto _match; |
|
11875
|
|
|
|
|
|
|
} |
|
11876
|
|
|
|
|
|
|
} |
|
11877
|
0
|
|
|
|
|
|
_trans += _klen; |
|
11878
|
|
|
|
|
|
|
} |
|
11879
|
|
|
|
|
|
|
|
|
11880
|
|
|
|
|
|
|
_match: |
|
11881
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_split_token_indicies[_trans]; |
|
11882
|
0
|
|
|
|
|
|
cs = _english_tokenizer_split_token_trans_targs[_trans]; |
|
11883
|
|
|
|
|
|
|
|
|
11884
|
0
|
0
|
|
|
|
|
if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 ) |
|
11885
|
|
|
|
|
|
|
goto _again; |
|
11886
|
|
|
|
|
|
|
|
|
11887
|
0
|
|
|
|
|
|
switch ( _english_tokenizer_split_token_trans_actions[_trans] ) { |
|
11888
|
|
|
|
|
|
|
case 1: |
|
11889
|
0
|
|
|
|
|
|
{ split_mark = index - tokens.back().start + 1; } |
|
11890
|
0
|
|
|
|
|
|
break; |
|
11891
|
|
|
|
|
|
|
case 2: |
|
11892
|
0
|
|
|
|
|
|
{ split_mark = index - tokens.back().start + 1; } |
|
11893
|
0
|
|
|
|
|
|
{ split_len = split_mark; {( index)++; goto _out; } } |
|
11894
|
|
|
|
|
|
|
break; |
|
11895
|
|
|
|
|
|
|
} |
|
11896
|
|
|
|
|
|
|
|
|
11897
|
|
|
|
|
|
|
_again: |
|
11898
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
11899
|
|
|
|
|
|
|
goto _out; |
|
11900
|
0
|
0
|
|
|
|
|
if ( ++( index) != ( end) ) |
|
11901
|
|
|
|
|
|
|
goto _resume; |
|
11902
|
|
|
|
|
|
|
_test_eof: {} |
|
11903
|
0
|
0
|
|
|
|
|
if ( ( index) == ( end) ) |
|
11904
|
|
|
|
|
|
|
{ |
|
11905
|
0
|
0
|
|
|
|
|
switch ( _english_tokenizer_split_token_eof_actions[cs] ) { |
|
11906
|
|
|
|
|
|
|
case 3: |
|
11907
|
0
|
|
|
|
|
|
{ split_len = split_mark; {( index)++; goto _out; } } |
|
11908
|
|
|
|
|
|
|
break; |
|
11909
|
|
|
|
|
|
|
} |
|
11910
|
|
|
|
|
|
|
} |
|
11911
|
|
|
|
|
|
|
|
|
11912
|
|
|
|
|
|
|
_out: {} |
|
11913
|
|
|
|
|
|
|
} |
|
11914
|
|
|
|
|
|
|
|
|
11915
|
0
|
0
|
|
|
|
|
if (split_len && split_len < end) { |
|
11916
|
0
|
|
|
|
|
|
tokens.back().length -= split_len; |
|
11917
|
0
|
|
|
|
|
|
tokens.emplace_back(end - split_len, split_len); |
|
11918
|
|
|
|
|
|
|
} |
|
11919
|
|
|
|
|
|
|
} |
|
11920
|
|
|
|
|
|
|
|
|
11921
|
|
|
|
|
|
|
static const char _english_tokenizer_cond_offsets[] = { |
|
11922
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11923
|
|
|
|
|
|
|
0, 0, 0, 2, 2, 2, 2, 2, |
|
11924
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 2, 2, 2, |
|
11925
|
|
|
|
|
|
|
2, 2, 2, 2, 2 |
|
11926
|
|
|
|
|
|
|
}; |
|
11927
|
|
|
|
|
|
|
|
|
11928
|
|
|
|
|
|
|
static const char _english_tokenizer_cond_lengths[] = { |
|
11929
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11930
|
|
|
|
|
|
|
0, 0, 2, 0, 0, 0, 0, 0, |
|
11931
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
11932
|
|
|
|
|
|
|
0, 0, 0, 0, 0 |
|
11933
|
|
|
|
|
|
|
}; |
|
11934
|
|
|
|
|
|
|
|
|
11935
|
|
|
|
|
|
|
static const short _english_tokenizer_cond_keys[] = { |
|
11936
|
|
|
|
|
|
|
43u, 43u, 45u, 45u, 0 |
|
11937
|
|
|
|
|
|
|
}; |
|
11938
|
|
|
|
|
|
|
|
|
11939
|
|
|
|
|
|
|
static const char _english_tokenizer_cond_spaces[] = { |
|
11940
|
|
|
|
|
|
|
1, 0, 0 |
|
11941
|
|
|
|
|
|
|
}; |
|
11942
|
|
|
|
|
|
|
|
|
11943
|
|
|
|
|
|
|
static const unsigned char _english_tokenizer_key_offsets[] = { |
|
11944
|
|
|
|
|
|
|
0, 0, 17, 29, 43, 46, 49, 52, |
|
11945
|
|
|
|
|
|
|
55, 60, 63, 98, 103, 107, 110, 114, |
|
11946
|
|
|
|
|
|
|
119, 120, 125, 126, 131, 145, 152, 156, |
|
11947
|
|
|
|
|
|
|
161, 164, 179, 192, 206 |
|
11948
|
|
|
|
|
|
|
}; |
|
11949
|
|
|
|
|
|
|
|
|
11950
|
|
|
|
|
|
|
static const short _english_tokenizer_trans_keys[] = { |
|
11951
|
|
|
|
|
|
|
13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u, |
|
11952
|
|
|
|
|
|
|
133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u, |
|
11953
|
|
|
|
|
|
|
90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u, |
|
11954
|
|
|
|
|
|
|
135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u, |
|
11955
|
|
|
|
|
|
|
39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u, |
|
11956
|
|
|
|
|
|
|
161u, 9u, 10u, 159u, 48u, 57u, 159u, 48u, |
|
11957
|
|
|
|
|
|
|
57u, 159u, 48u, 57u, 159u, 48u, 57u, 43u, |
|
11958
|
|
|
|
|
|
|
45u, 159u, 48u, 57u, 159u, 48u, 57u, 9u, |
|
11959
|
|
|
|
|
|
|
10u, 13u, 32u, 33u, 44u, 46u, 47u, 63u, |
|
11960
|
|
|
|
|
|
|
129u, 131u, 135u, 142u, 147u, 157u, 159u, 160u, |
|
11961
|
|
|
|
|
|
|
301u, 557u, 811u, 1067u, 0u, 42u, 48u, 57u, |
|
11962
|
|
|
|
|
|
|
58u, 64u, 65u, 90u, 91u, 96u, 97u, 122u, |
|
11963
|
|
|
|
|
|
|
123u, 255u, 9u, 10u, 13u, 32u, 147u, 9u, |
|
11964
|
|
|
|
|
|
|
13u, 32u, 147u, 9u, 32u, 147u, 9u, 10u, |
|
11965
|
|
|
|
|
|
|
32u, 147u, 9u, 10u, 13u, 32u, 147u, 13u, |
|
11966
|
|
|
|
|
|
|
9u, 10u, 13u, 32u, 147u, 10u, 9u, 10u, |
|
11967
|
|
|
|
|
|
|
13u, 32u, 147u, 13u, 32u, 34u, 39u, 41u, |
|
11968
|
|
|
|
|
|
|
59u, 93u, 125u, 139u, 141u, 147u, 161u, 9u, |
|
11969
|
|
|
|
|
|
|
10u, 44u, 46u, 69u, 101u, 159u, 48u, 57u, |
|
11970
|
|
|
|
|
|
|
44u, 46u, 69u, 101u, 69u, 101u, 159u, 48u, |
|
11971
|
|
|
|
|
|
|
57u, 159u, 48u, 57u, 39u, 45u, 129u, 131u, |
|
11972
|
|
|
|
|
|
|
135u, 151u, 155u, 157u, 161u, 65u, 90u, 97u, |
|
11973
|
|
|
|
|
|
|
122u, 142u, 143u, 45u, 129u, 131u, 135u, 151u, |
|
11974
|
|
|
|
|
|
|
155u, 157u, 65u, 90u, 97u, 122u, 142u, 143u, |
|
11975
|
|
|
|
|
|
|
39u, 129u, 131u, 135u, 151u, 155u, 157u, 161u, |
|
11976
|
|
|
|
|
|
|
65u, 90u, 97u, 122u, 142u, 143u, 159u, 48u, |
|
11977
|
|
|
|
|
|
|
57u, 0 |
|
11978
|
|
|
|
|
|
|
}; |
|
11979
|
|
|
|
|
|
|
|
|
11980
|
|
|
|
|
|
|
static const char _english_tokenizer_single_lengths[] = { |
|
11981
|
|
|
|
|
|
|
0, 13, 10, 12, 1, 1, 1, 1, |
|
11982
|
|
|
|
|
|
|
3, 1, 21, 5, 4, 3, 4, 5, |
|
11983
|
|
|
|
|
|
|
1, 5, 1, 5, 12, 5, 4, 3, |
|
11984
|
|
|
|
|
|
|
1, 9, 7, 8, 1 |
|
11985
|
|
|
|
|
|
|
}; |
|
11986
|
|
|
|
|
|
|
|
|
11987
|
|
|
|
|
|
|
static const char _english_tokenizer_range_lengths[] = { |
|
11988
|
|
|
|
|
|
|
0, 2, 1, 1, 1, 1, 1, 1, |
|
11989
|
|
|
|
|
|
|
1, 1, 7, 0, 0, 0, 0, 0, |
|
11990
|
|
|
|
|
|
|
0, 0, 0, 0, 1, 1, 0, 1, |
|
11991
|
|
|
|
|
|
|
1, 3, 3, 3, 1 |
|
11992
|
|
|
|
|
|
|
}; |
|
11993
|
|
|
|
|
|
|
|
|
11994
|
|
|
|
|
|
|
static const unsigned char _english_tokenizer_index_offsets[] = { |
|
11995
|
|
|
|
|
|
|
0, 0, 16, 28, 42, 45, 48, 51, |
|
11996
|
|
|
|
|
|
|
54, 59, 62, 91, 97, 102, 106, 111, |
|
11997
|
|
|
|
|
|
|
117, 119, 125, 127, 133, 147, 154, 159, |
|
11998
|
|
|
|
|
|
|
164, 167, 180, 191, 203 |
|
11999
|
|
|
|
|
|
|
}; |
|
12000
|
|
|
|
|
|
|
|
|
12001
|
|
|
|
|
|
|
static const char _english_tokenizer_indicies[] = { |
|
12002
|
|
|
|
|
|
|
1, 1, 2, 2, 2, 2, 2, 3, |
|
12003
|
|
|
|
|
|
|
2, 3, 1, 2, 2, 1, 3, 0, |
|
12004
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 3, 2, 3, |
|
12005
|
|
|
|
|
|
|
2, 2, 3, 0, 4, 4, 5, 5, |
|
12006
|
|
|
|
|
|
|
5, 5, 5, 5, 5, 5, 4, 5, |
|
12007
|
|
|
|
|
|
|
4, 0, 6, 6, 0, 7, 7, 0, |
|
12008
|
|
|
|
|
|
|
8, 8, 0, 9, 9, 0, 10, 10, |
|
12009
|
|
|
|
|
|
|
11, 11, 0, 11, 11, 0, 13, 14, |
|
12010
|
|
|
|
|
|
|
15, 13, 16, 12, 16, 12, 16, 19, |
|
12011
|
|
|
|
|
|
|
19, 19, 19, 13, 19, 18, 16, 12, |
|
12012
|
|
|
|
|
|
|
20, 12, 20, 12, 18, 12, 19, 12, |
|
12013
|
|
|
|
|
|
|
19, 12, 17, 13, 22, 23, 13, 13, |
|
12014
|
|
|
|
|
|
|
21, 13, 24, 13, 13, 21, 13, 13, |
|
12015
|
|
|
|
|
|
|
13, 21, 13, 24, 13, 13, 21, 13, |
|
12016
|
|
|
|
|
|
|
25, 26, 13, 13, 21, 28, 27, 13, |
|
12017
|
|
|
|
|
|
|
25, 29, 13, 13, 21, 28, 27, 13, |
|
12018
|
|
|
|
|
|
|
26, 29, 13, 13, 21, 4, 4, 5, |
|
12019
|
|
|
|
|
|
|
5, 5, 5, 5, 5, 5, 5, 4, |
|
12020
|
|
|
|
|
|
|
5, 4, 30, 31, 32, 33, 33, 18, |
|
12021
|
|
|
|
|
|
|
18, 30, 31, 32, 33, 33, 30, 33, |
|
12022
|
|
|
|
|
|
|
33, 9, 9, 30, 11, 11, 30, 34, |
|
12023
|
|
|
|
|
|
|
35, 19, 19, 19, 19, 19, 19, 34, |
|
12024
|
|
|
|
|
|
|
19, 19, 19, 30, 35, 19, 19, 19, |
|
12025
|
|
|
|
|
|
|
19, 19, 19, 19, 19, 19, 30, 34, |
|
12026
|
|
|
|
|
|
|
19, 19, 19, 19, 19, 19, 34, 19, |
|
12027
|
|
|
|
|
|
|
19, 19, 30, 18, 18, 30, 0 |
|
12028
|
|
|
|
|
|
|
}; |
|
12029
|
|
|
|
|
|
|
|
|
12030
|
|
|
|
|
|
|
static const char _english_tokenizer_trans_targs[] = { |
|
12031
|
|
|
|
|
|
|
10, 1, 2, 10, 1, 3, 5, 6, |
|
12032
|
|
|
|
|
|
|
22, 23, 9, 24, 10, 11, 15, 19, |
|
12033
|
|
|
|
|
|
|
20, 0, 21, 25, 28, 10, 12, 14, |
|
12034
|
|
|
|
|
|
|
13, 16, 17, 10, 10, 18, 10, 4, |
|
12035
|
|
|
|
|
|
|
7, 8, 26, 27 |
|
12036
|
|
|
|
|
|
|
}; |
|
12037
|
|
|
|
|
|
|
|
|
12038
|
|
|
|
|
|
|
static const char _english_tokenizer_trans_actions[] = { |
|
12039
|
|
|
|
|
|
|
1, 0, 0, 2, 3, 0, 0, 0, |
|
12040
|
|
|
|
|
|
|
4, 4, 0, 0, 7, 0, 0, 0, |
|
12041
|
|
|
|
|
|
|
4, 0, 4, 0, 0, 8, 0, 0, |
|
12042
|
|
|
|
|
|
|
0, 0, 0, 9, 10, 0, 11, 0, |
|
12043
|
|
|
|
|
|
|
0, 0, 0, 0 |
|
12044
|
|
|
|
|
|
|
}; |
|
12045
|
|
|
|
|
|
|
|
|
12046
|
|
|
|
|
|
|
static const char _english_tokenizer_to_state_actions[] = { |
|
12047
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12048
|
|
|
|
|
|
|
0, 0, 5, 0, 0, 0, 0, 0, |
|
12049
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12050
|
|
|
|
|
|
|
0, 0, 0, 0, 0 |
|
12051
|
|
|
|
|
|
|
}; |
|
12052
|
|
|
|
|
|
|
|
|
12053
|
|
|
|
|
|
|
static const char _english_tokenizer_from_state_actions[] = { |
|
12054
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12055
|
|
|
|
|
|
|
0, 0, 6, 0, 0, 0, 0, 0, |
|
12056
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12057
|
|
|
|
|
|
|
0, 0, 0, 0, 0 |
|
12058
|
|
|
|
|
|
|
}; |
|
12059
|
|
|
|
|
|
|
|
|
12060
|
|
|
|
|
|
|
static const unsigned char _english_tokenizer_eof_trans[] = { |
|
12061
|
|
|
|
|
|
|
0, 1, 1, 1, 1, 1, 1, 1, |
|
12062
|
|
|
|
|
|
|
1, 1, 0, 22, 22, 22, 22, 22, |
|
12063
|
|
|
|
|
|
|
28, 22, 28, 22, 31, 31, 31, 31, |
|
12064
|
|
|
|
|
|
|
31, 31, 31, 31, 31 |
|
12065
|
|
|
|
|
|
|
}; |
|
12066
|
|
|
|
|
|
|
|
|
12067
|
|
|
|
|
|
|
static const int english_tokenizer_start = 10; |
|
12068
|
|
|
|
|
|
|
|
|
12069
|
0
|
0
|
|
|
|
|
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12070
|
|
|
|
|
|
|
|
|
12071
|
0
|
|
|
|
|
|
bool english_tokenizer::next_sentence(vector& tokens) { |
|
12072
|
|
|
|
|
|
|
using namespace unilib; |
|
12073
|
|
|
|
|
|
|
|
|
12074
|
|
|
|
|
|
|
int cs, act; |
|
12075
|
|
|
|
|
|
|
size_t ts, te; |
|
12076
|
|
|
|
|
|
|
size_t whitespace = 0; // Suppress "may be uninitialized" warning |
|
12077
|
|
|
|
|
|
|
|
|
12078
|
0
|
0
|
|
|
|
|
while (tokenize_url_email(tokens)) |
|
12079
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) |
|
12080
|
|
|
|
|
|
|
return true; |
|
12081
|
|
|
|
|
|
|
|
|
12082
|
|
|
|
|
|
|
{ |
|
12083
|
|
|
|
|
|
|
cs = english_tokenizer_start; |
|
12084
|
0
|
|
|
|
|
|
ts = 0; |
|
12085
|
|
|
|
|
|
|
te = 0; |
|
12086
|
|
|
|
|
|
|
act = 0; |
|
12087
|
|
|
|
|
|
|
} |
|
12088
|
|
|
|
|
|
|
|
|
12089
|
|
|
|
|
|
|
{ |
|
12090
|
|
|
|
|
|
|
int _klen; |
|
12091
|
|
|
|
|
|
|
const short *_keys; |
|
12092
|
|
|
|
|
|
|
int _trans; |
|
12093
|
|
|
|
|
|
|
short _widec; |
|
12094
|
|
|
|
|
|
|
|
|
12095
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
|
12096
|
|
|
|
|
|
|
goto _test_eof; |
|
12097
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
12098
|
|
|
|
|
|
|
goto _out; |
|
12099
|
|
|
|
|
|
|
_resume: |
|
12100
|
0
|
0
|
|
|
|
|
switch ( _english_tokenizer_from_state_actions[cs] ) { |
|
12101
|
|
|
|
|
|
|
case 6: |
|
12102
|
0
|
|
|
|
|
|
{ts = ( current);} |
|
12103
|
0
|
|
|
|
|
|
break; |
|
12104
|
|
|
|
|
|
|
} |
|
12105
|
|
|
|
|
|
|
|
|
12106
|
0
|
|
|
|
|
|
_widec = ( ragel_char(chars[current])); |
|
12107
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_cond_lengths[cs]; |
|
12108
|
0
|
|
|
|
|
|
_keys = _english_tokenizer_cond_keys + (_english_tokenizer_cond_offsets[cs]*2); |
|
12109
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
12110
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
12111
|
|
|
|
|
|
|
const short *_mid; |
|
12112
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
12113
|
|
|
|
|
|
|
while (1) { |
|
12114
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
12115
|
|
|
|
|
|
|
break; |
|
12116
|
|
|
|
|
|
|
|
|
12117
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
12118
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
12119
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
12120
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
12121
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
12122
|
|
|
|
|
|
|
else { |
|
12123
|
0
|
|
|
|
|
|
switch ( _english_tokenizer_cond_spaces[_english_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) { |
|
12124
|
|
|
|
|
|
|
case 0: { |
|
12125
|
0
|
|
|
|
|
|
_widec = (short)(256u + (( ragel_char(chars[current])) - 0u)); |
|
12126
|
0
|
0
|
|
|
|
|
if ( |
|
12127
|
0
|
0
|
|
|
|
|
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
|
0
|
|
|
|
|
|
|
12128
|
|
|
|
|
|
|
break; |
|
12129
|
|
|
|
|
|
|
} |
|
12130
|
|
|
|
|
|
|
case 1: { |
|
12131
|
0
|
|
|
|
|
|
_widec = (short)(768u + (( ragel_char(chars[current])) - 0u)); |
|
12132
|
0
|
0
|
|
|
|
|
if ( |
|
12133
|
0
|
0
|
|
|
|
|
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12134
|
|
|
|
|
|
|
break; |
|
12135
|
|
|
|
|
|
|
} |
|
12136
|
|
|
|
|
|
|
} |
|
12137
|
|
|
|
|
|
|
break; |
|
12138
|
|
|
|
|
|
|
} |
|
12139
|
|
|
|
|
|
|
} |
|
12140
|
|
|
|
|
|
|
} |
|
12141
|
|
|
|
|
|
|
|
|
12142
|
0
|
|
|
|
|
|
_keys = _english_tokenizer_trans_keys + _english_tokenizer_key_offsets[cs]; |
|
12143
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_index_offsets[cs]; |
|
12144
|
|
|
|
|
|
|
|
|
12145
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_single_lengths[cs]; |
|
12146
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
12147
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
12148
|
|
|
|
|
|
|
const short *_mid; |
|
12149
|
0
|
|
|
|
|
|
const short *_upper = _keys + _klen - 1; |
|
12150
|
|
|
|
|
|
|
while (1) { |
|
12151
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
12152
|
|
|
|
|
|
|
break; |
|
12153
|
|
|
|
|
|
|
|
|
12154
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
12155
|
0
|
0
|
|
|
|
|
if ( _widec < *_mid ) |
|
12156
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
12157
|
0
|
0
|
|
|
|
|
else if ( _widec > *_mid ) |
|
12158
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
12159
|
|
|
|
|
|
|
else { |
|
12160
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
12161
|
0
|
|
|
|
|
|
goto _match; |
|
12162
|
|
|
|
|
|
|
} |
|
12163
|
|
|
|
|
|
|
} |
|
12164
|
0
|
|
|
|
|
|
_keys += _klen; |
|
12165
|
0
|
|
|
|
|
|
_trans += _klen; |
|
12166
|
|
|
|
|
|
|
} |
|
12167
|
|
|
|
|
|
|
|
|
12168
|
0
|
|
|
|
|
|
_klen = _english_tokenizer_range_lengths[cs]; |
|
12169
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
12170
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
12171
|
|
|
|
|
|
|
const short *_mid; |
|
12172
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
12173
|
|
|
|
|
|
|
while (1) { |
|
12174
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
12175
|
|
|
|
|
|
|
break; |
|
12176
|
|
|
|
|
|
|
|
|
12177
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
12178
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
12179
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
12180
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
12181
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
12182
|
|
|
|
|
|
|
else { |
|
12183
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
12184
|
0
|
|
|
|
|
|
goto _match; |
|
12185
|
|
|
|
|
|
|
} |
|
12186
|
|
|
|
|
|
|
} |
|
12187
|
0
|
|
|
|
|
|
_trans += _klen; |
|
12188
|
|
|
|
|
|
|
} |
|
12189
|
|
|
|
|
|
|
|
|
12190
|
|
|
|
|
|
|
_match: |
|
12191
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_indicies[_trans]; |
|
12192
|
|
|
|
|
|
|
_eof_trans: |
|
12193
|
0
|
|
|
|
|
|
cs = _english_tokenizer_trans_targs[_trans]; |
|
12194
|
|
|
|
|
|
|
|
|
12195
|
0
|
0
|
|
|
|
|
if ( _english_tokenizer_trans_actions[_trans] == 0 ) |
|
12196
|
|
|
|
|
|
|
goto _again; |
|
12197
|
|
|
|
|
|
|
|
|
12198
|
0
|
|
|
|
|
|
switch ( _english_tokenizer_trans_actions[_trans] ) { |
|
12199
|
|
|
|
|
|
|
case 3: |
|
12200
|
0
|
|
|
|
|
|
{ whitespace = current; } |
|
12201
|
0
|
|
|
|
|
|
break; |
|
12202
|
|
|
|
|
|
|
case 4: |
|
12203
|
0
|
|
|
|
|
|
{te = ( current)+1;} |
|
12204
|
0
|
|
|
|
|
|
break; |
|
12205
|
|
|
|
|
|
|
case 7: |
|
12206
|
0
|
|
|
|
|
|
{te = ( current)+1;{ tokens.emplace_back(ts, te - ts); |
|
12207
|
0
|
|
|
|
|
|
split_token(tokens); |
|
12208
|
0
|
|
|
|
|
|
current = te; |
|
12209
|
0
|
0
|
|
|
|
|
do |
|
12210
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12211
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12212
|
0
|
|
|
|
|
|
( current)--; |
|
12213
|
|
|
|
|
|
|
}} |
|
12214
|
0
|
|
|
|
|
|
break; |
|
12215
|
|
|
|
|
|
|
case 2: |
|
12216
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
|
12217
|
0
|
|
|
|
|
|
bool eos = is_eos(tokens, chars[ts].chr, &abbreviations); |
|
12218
|
0
|
0
|
|
|
|
|
for (current = ts; current < whitespace; current++) |
|
12219
|
0
|
|
|
|
|
|
tokens.emplace_back(current, 1); |
|
12220
|
0
|
|
|
|
|
|
{( current) = (( whitespace))-1;} |
|
12221
|
0
|
0
|
|
|
|
|
if (eos) {( current)++; goto _out; } |
|
12222
|
|
|
|
|
|
|
}} |
|
12223
|
|
|
|
|
|
|
break; |
|
12224
|
|
|
|
|
|
|
case 10: |
|
12225
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
|
12226
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
|
12227
|
0
|
|
|
|
|
|
current = te; |
|
12228
|
0
|
0
|
|
|
|
|
do |
|
12229
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12230
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12231
|
0
|
|
|
|
|
|
( current)--; |
|
12232
|
|
|
|
|
|
|
}} |
|
12233
|
0
|
|
|
|
|
|
break; |
|
12234
|
|
|
|
|
|
|
case 11: |
|
12235
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts); |
|
12236
|
0
|
|
|
|
|
|
split_token(tokens); |
|
12237
|
0
|
|
|
|
|
|
current = te; |
|
12238
|
0
|
0
|
|
|
|
|
do |
|
12239
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12240
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12241
|
0
|
|
|
|
|
|
( current)--; |
|
12242
|
|
|
|
|
|
|
}} |
|
12243
|
0
|
|
|
|
|
|
break; |
|
12244
|
|
|
|
|
|
|
case 8: |
|
12245
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
|
12246
|
0
|
|
|
|
|
|
current = te; |
|
12247
|
0
|
0
|
|
|
|
|
do |
|
12248
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12249
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12250
|
0
|
|
|
|
|
|
( current)--; |
|
12251
|
|
|
|
|
|
|
}} |
|
12252
|
0
|
|
|
|
|
|
break; |
|
12253
|
|
|
|
|
|
|
case 9: |
|
12254
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
|
12255
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
|
12256
|
0
|
|
|
|
|
|
current = te; |
|
12257
|
0
|
0
|
|
|
|
|
do |
|
12258
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12259
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12260
|
0
|
|
|
|
|
|
( current)--; |
|
12261
|
|
|
|
|
|
|
}} |
|
12262
|
0
|
|
|
|
|
|
break; |
|
12263
|
|
|
|
|
|
|
case 1: |
|
12264
|
0
|
|
|
|
|
|
{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts); |
|
12265
|
0
|
|
|
|
|
|
split_token(tokens); |
|
12266
|
0
|
|
|
|
|
|
current = te; |
|
12267
|
0
|
0
|
|
|
|
|
do |
|
12268
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12269
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12270
|
0
|
|
|
|
|
|
( current)--; |
|
12271
|
|
|
|
|
|
|
}} |
|
12272
|
0
|
|
|
|
|
|
break; |
|
12273
|
|
|
|
|
|
|
} |
|
12274
|
|
|
|
|
|
|
|
|
12275
|
|
|
|
|
|
|
_again: |
|
12276
|
0
|
0
|
|
|
|
|
switch ( _english_tokenizer_to_state_actions[cs] ) { |
|
12277
|
|
|
|
|
|
|
case 5: |
|
12278
|
0
|
|
|
|
|
|
{ts = 0;} |
|
12279
|
0
|
|
|
|
|
|
break; |
|
12280
|
|
|
|
|
|
|
} |
|
12281
|
|
|
|
|
|
|
|
|
12282
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
12283
|
|
|
|
|
|
|
goto _out; |
|
12284
|
0
|
0
|
|
|
|
|
if ( ++( current) != ( (chars.size() - 1)) ) |
|
12285
|
|
|
|
|
|
|
goto _resume; |
|
12286
|
|
|
|
|
|
|
_test_eof: {} |
|
12287
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
|
12288
|
|
|
|
|
|
|
{ |
|
12289
|
0
|
0
|
|
|
|
|
if ( _english_tokenizer_eof_trans[cs] > 0 ) { |
|
12290
|
0
|
|
|
|
|
|
_trans = _english_tokenizer_eof_trans[cs] - 1; |
|
12291
|
0
|
|
|
|
|
|
goto _eof_trans; |
|
12292
|
|
|
|
|
|
|
} |
|
12293
|
|
|
|
|
|
|
} |
|
12294
|
|
|
|
|
|
|
|
|
12295
|
|
|
|
|
|
|
_out: {} |
|
12296
|
|
|
|
|
|
|
} |
|
12297
|
|
|
|
|
|
|
|
|
12298
|
|
|
|
|
|
|
(void)act; // Suppress unused variable warning |
|
12299
|
|
|
|
|
|
|
|
|
12300
|
0
|
|
|
|
|
|
return !tokens.empty(); |
|
12301
|
|
|
|
|
|
|
} |
|
12302
|
|
|
|
|
|
|
|
|
12303
|
|
|
|
|
|
|
} // namespace morphodita |
|
12304
|
|
|
|
|
|
|
|
|
12305
|
|
|
|
|
|
|
///////// |
|
12306
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer.cpp |
|
12307
|
|
|
|
|
|
|
///////// |
|
12308
|
|
|
|
|
|
|
|
|
12309
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
12310
|
|
|
|
|
|
|
// |
|
12311
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
12312
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
12313
|
|
|
|
|
|
|
// |
|
12314
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
12315
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
12316
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
12317
|
|
|
|
|
|
|
|
|
12318
|
|
|
|
|
|
|
namespace morphodita { |
|
12319
|
|
|
|
|
|
|
|
|
12320
|
|
|
|
|
|
|
static const char _generic_tokenizer_cond_offsets[] = { |
|
12321
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12322
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 2, 2, 2, |
|
12323
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 2, 2 |
|
12324
|
|
|
|
|
|
|
}; |
|
12325
|
|
|
|
|
|
|
|
|
12326
|
|
|
|
|
|
|
static const char _generic_tokenizer_cond_lengths[] = { |
|
12327
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 2, |
|
12328
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12329
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0 |
|
12330
|
|
|
|
|
|
|
}; |
|
12331
|
|
|
|
|
|
|
|
|
12332
|
|
|
|
|
|
|
static const short _generic_tokenizer_cond_keys[] = { |
|
12333
|
|
|
|
|
|
|
43u, 43u, 45u, 45u, 0 |
|
12334
|
|
|
|
|
|
|
}; |
|
12335
|
|
|
|
|
|
|
|
|
12336
|
|
|
|
|
|
|
static const char _generic_tokenizer_cond_spaces[] = { |
|
12337
|
|
|
|
|
|
|
1, 0, 0 |
|
12338
|
|
|
|
|
|
|
}; |
|
12339
|
|
|
|
|
|
|
|
|
12340
|
|
|
|
|
|
|
static const unsigned char _generic_tokenizer_key_offsets[] = { |
|
12341
|
|
|
|
|
|
|
0, 0, 17, 29, 43, 46, 51, 54, |
|
12342
|
|
|
|
|
|
|
89, 94, 98, 101, 105, 110, 111, 116, |
|
12343
|
|
|
|
|
|
|
117, 122, 136, 142, 147, 150, 162 |
|
12344
|
|
|
|
|
|
|
}; |
|
12345
|
|
|
|
|
|
|
|
|
12346
|
|
|
|
|
|
|
static const short _generic_tokenizer_trans_keys[] = { |
|
12347
|
|
|
|
|
|
|
13u, 32u, 34u, 40u, 91u, 96u, 123u, 129u, |
|
12348
|
|
|
|
|
|
|
133u, 135u, 147u, 150u, 162u, 9u, 10u, 65u, |
|
12349
|
|
|
|
|
|
|
90u, 34u, 40u, 91u, 96u, 123u, 129u, 133u, |
|
12350
|
|
|
|
|
|
|
135u, 150u, 162u, 65u, 90u, 13u, 32u, 34u, |
|
12351
|
|
|
|
|
|
|
39u, 41u, 59u, 93u, 125u, 139u, 141u, 147u, |
|
12352
|
|
|
|
|
|
|
161u, 9u, 10u, 159u, 48u, 57u, 43u, 45u, |
|
12353
|
|
|
|
|
|
|
159u, 48u, 57u, 159u, 48u, 57u, 9u, 10u, |
|
12354
|
|
|
|
|
|
|
13u, 32u, 33u, 44u, 46u, 47u, 63u, 129u, |
|
12355
|
|
|
|
|
|
|
131u, 135u, 142u, 147u, 157u, 159u, 160u, 301u, |
|
12356
|
|
|
|
|
|
|
557u, 811u, 1067u, 0u, 42u, 48u, 57u, 58u, |
|
12357
|
|
|
|
|
|
|
64u, 65u, 90u, 91u, 96u, 97u, 122u, 123u, |
|
12358
|
|
|
|
|
|
|
255u, 9u, 10u, 13u, 32u, 147u, 9u, 13u, |
|
12359
|
|
|
|
|
|
|
32u, 147u, 9u, 32u, 147u, 9u, 10u, 32u, |
|
12360
|
|
|
|
|
|
|
147u, 9u, 10u, 13u, 32u, 147u, 13u, 9u, |
|
12361
|
|
|
|
|
|
|
10u, 13u, 32u, 147u, 10u, 9u, 10u, 13u, |
|
12362
|
|
|
|
|
|
|
32u, 147u, 13u, 32u, 34u, 39u, 41u, 59u, |
|
12363
|
|
|
|
|
|
|
93u, 125u, 139u, 141u, 147u, 161u, 9u, 10u, |
|
12364
|
|
|
|
|
|
|
46u, 69u, 101u, 159u, 48u, 57u, 69u, 101u, |
|
12365
|
|
|
|
|
|
|
159u, 48u, 57u, 159u, 48u, 57u, 129u, 131u, |
|
12366
|
|
|
|
|
|
|
135u, 151u, 155u, 157u, 65u, 90u, 97u, 122u, |
|
12367
|
|
|
|
|
|
|
142u, 143u, 159u, 48u, 57u, 0 |
|
12368
|
|
|
|
|
|
|
}; |
|
12369
|
|
|
|
|
|
|
|
|
12370
|
|
|
|
|
|
|
static const char _generic_tokenizer_single_lengths[] = { |
|
12371
|
|
|
|
|
|
|
0, 13, 10, 12, 1, 3, 1, 21, |
|
12372
|
|
|
|
|
|
|
5, 4, 3, 4, 5, 1, 5, 1, |
|
12373
|
|
|
|
|
|
|
5, 12, 4, 3, 1, 6, 1 |
|
12374
|
|
|
|
|
|
|
}; |
|
12375
|
|
|
|
|
|
|
|
|
12376
|
|
|
|
|
|
|
static const char _generic_tokenizer_range_lengths[] = { |
|
12377
|
|
|
|
|
|
|
0, 2, 1, 1, 1, 1, 1, 7, |
|
12378
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12379
|
|
|
|
|
|
|
0, 1, 1, 1, 1, 3, 1 |
|
12380
|
|
|
|
|
|
|
}; |
|
12381
|
|
|
|
|
|
|
|
|
12382
|
|
|
|
|
|
|
static const unsigned char _generic_tokenizer_index_offsets[] = { |
|
12383
|
|
|
|
|
|
|
0, 0, 16, 28, 42, 45, 50, 53, |
|
12384
|
|
|
|
|
|
|
82, 88, 93, 97, 102, 108, 110, 116, |
|
12385
|
|
|
|
|
|
|
118, 124, 138, 144, 149, 152, 162 |
|
12386
|
|
|
|
|
|
|
}; |
|
12387
|
|
|
|
|
|
|
|
|
12388
|
|
|
|
|
|
|
static const char _generic_tokenizer_indicies[] = { |
|
12389
|
|
|
|
|
|
|
1, 1, 2, 2, 2, 2, 2, 3, |
|
12390
|
|
|
|
|
|
|
2, 3, 1, 2, 2, 1, 3, 0, |
|
12391
|
|
|
|
|
|
|
2, 2, 2, 2, 2, 3, 2, 3, |
|
12392
|
|
|
|
|
|
|
2, 2, 3, 0, 4, 4, 5, 5, |
|
12393
|
|
|
|
|
|
|
5, 5, 5, 5, 5, 5, 4, 5, |
|
12394
|
|
|
|
|
|
|
4, 0, 6, 6, 0, 7, 7, 8, |
|
12395
|
|
|
|
|
|
|
8, 0, 8, 8, 0, 10, 11, 12, |
|
12396
|
|
|
|
|
|
|
10, 13, 9, 13, 9, 13, 16, 16, |
|
12397
|
|
|
|
|
|
|
16, 16, 10, 16, 15, 13, 9, 17, |
|
12398
|
|
|
|
|
|
|
9, 17, 9, 15, 9, 16, 9, 16, |
|
12399
|
|
|
|
|
|
|
9, 14, 10, 19, 20, 10, 10, 18, |
|
12400
|
|
|
|
|
|
|
10, 21, 10, 10, 18, 10, 10, 10, |
|
12401
|
|
|
|
|
|
|
18, 10, 21, 10, 10, 18, 10, 22, |
|
12402
|
|
|
|
|
|
|
23, 10, 10, 18, 25, 24, 10, 22, |
|
12403
|
|
|
|
|
|
|
26, 10, 10, 18, 25, 24, 10, 23, |
|
12404
|
|
|
|
|
|
|
26, 10, 10, 18, 4, 4, 5, 5, |
|
12405
|
|
|
|
|
|
|
5, 5, 5, 5, 5, 5, 4, 5, |
|
12406
|
|
|
|
|
|
|
4, 27, 28, 29, 29, 15, 15, 27, |
|
12407
|
|
|
|
|
|
|
29, 29, 6, 6, 27, 8, 8, 27, |
|
12408
|
|
|
|
|
|
|
16, 16, 16, 16, 16, 16, 16, 16, |
|
12409
|
|
|
|
|
|
|
16, 27, 15, 15, 27, 0 |
|
12410
|
|
|
|
|
|
|
}; |
|
12411
|
|
|
|
|
|
|
|
|
12412
|
|
|
|
|
|
|
static const char _generic_tokenizer_trans_targs[] = { |
|
12413
|
|
|
|
|
|
|
7, 1, 2, 7, 1, 3, 19, 6, |
|
12414
|
|
|
|
|
|
|
20, 7, 8, 12, 16, 17, 0, 18, |
|
12415
|
|
|
|
|
|
|
21, 22, 7, 9, 11, 10, 13, 14, |
|
12416
|
|
|
|
|
|
|
7, 7, 15, 7, 4, 5 |
|
12417
|
|
|
|
|
|
|
}; |
|
12418
|
|
|
|
|
|
|
|
|
12419
|
|
|
|
|
|
|
static const char _generic_tokenizer_trans_actions[] = { |
|
12420
|
|
|
|
|
|
|
1, 0, 0, 2, 3, 0, 4, 0, |
|
12421
|
|
|
|
|
|
|
0, 7, 0, 0, 0, 4, 0, 4, |
|
12422
|
|
|
|
|
|
|
0, 0, 8, 0, 0, 0, 0, 0, |
|
12423
|
|
|
|
|
|
|
9, 10, 0, 11, 0, 0 |
|
12424
|
|
|
|
|
|
|
}; |
|
12425
|
|
|
|
|
|
|
|
|
12426
|
|
|
|
|
|
|
static const char _generic_tokenizer_to_state_actions[] = { |
|
12427
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 5, |
|
12428
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12429
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0 |
|
12430
|
|
|
|
|
|
|
}; |
|
12431
|
|
|
|
|
|
|
|
|
12432
|
|
|
|
|
|
|
static const char _generic_tokenizer_from_state_actions[] = { |
|
12433
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 6, |
|
12434
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
12435
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0 |
|
12436
|
|
|
|
|
|
|
}; |
|
12437
|
|
|
|
|
|
|
|
|
12438
|
|
|
|
|
|
|
static const unsigned char _generic_tokenizer_eof_trans[] = { |
|
12439
|
|
|
|
|
|
|
0, 1, 1, 1, 1, 1, 1, 0, |
|
12440
|
|
|
|
|
|
|
19, 19, 19, 19, 19, 25, 19, 25, |
|
12441
|
|
|
|
|
|
|
19, 28, 28, 28, 28, 28, 28 |
|
12442
|
|
|
|
|
|
|
}; |
|
12443
|
|
|
|
|
|
|
|
|
12444
|
|
|
|
|
|
|
static const int generic_tokenizer_start = 7; |
|
12445
|
|
|
|
|
|
|
|
|
12446
|
0
|
0
|
|
|
|
|
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12447
|
|
|
|
|
|
|
|
|
12448
|
0
|
|
|
|
|
|
bool generic_tokenizer::next_sentence(vector& tokens) { |
|
12449
|
|
|
|
|
|
|
using namespace unilib; |
|
12450
|
|
|
|
|
|
|
|
|
12451
|
|
|
|
|
|
|
int cs, act; |
|
12452
|
|
|
|
|
|
|
size_t ts, te; |
|
12453
|
|
|
|
|
|
|
size_t whitespace = 0; // Suppress "may be uninitialized" warning |
|
12454
|
|
|
|
|
|
|
|
|
12455
|
0
|
0
|
|
|
|
|
while (tokenize_url_email(tokens)) |
|
12456
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) |
|
12457
|
|
|
|
|
|
|
return true; |
|
12458
|
|
|
|
|
|
|
|
|
12459
|
|
|
|
|
|
|
{ |
|
12460
|
|
|
|
|
|
|
cs = generic_tokenizer_start; |
|
12461
|
0
|
|
|
|
|
|
ts = 0; |
|
12462
|
|
|
|
|
|
|
te = 0; |
|
12463
|
|
|
|
|
|
|
act = 0; |
|
12464
|
|
|
|
|
|
|
} |
|
12465
|
|
|
|
|
|
|
|
|
12466
|
|
|
|
|
|
|
{ |
|
12467
|
|
|
|
|
|
|
int _klen; |
|
12468
|
|
|
|
|
|
|
const short *_keys; |
|
12469
|
|
|
|
|
|
|
int _trans; |
|
12470
|
|
|
|
|
|
|
short _widec; |
|
12471
|
|
|
|
|
|
|
|
|
12472
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
|
12473
|
|
|
|
|
|
|
goto _test_eof; |
|
12474
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
12475
|
|
|
|
|
|
|
goto _out; |
|
12476
|
|
|
|
|
|
|
_resume: |
|
12477
|
0
|
0
|
|
|
|
|
switch ( _generic_tokenizer_from_state_actions[cs] ) { |
|
12478
|
|
|
|
|
|
|
case 6: |
|
12479
|
0
|
|
|
|
|
|
{ts = ( current);} |
|
12480
|
0
|
|
|
|
|
|
break; |
|
12481
|
|
|
|
|
|
|
} |
|
12482
|
|
|
|
|
|
|
|
|
12483
|
0
|
|
|
|
|
|
_widec = ( ragel_char(chars[current])); |
|
12484
|
0
|
|
|
|
|
|
_klen = _generic_tokenizer_cond_lengths[cs]; |
|
12485
|
0
|
|
|
|
|
|
_keys = _generic_tokenizer_cond_keys + (_generic_tokenizer_cond_offsets[cs]*2); |
|
12486
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
12487
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
12488
|
|
|
|
|
|
|
const short *_mid; |
|
12489
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
12490
|
|
|
|
|
|
|
while (1) { |
|
12491
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
12492
|
|
|
|
|
|
|
break; |
|
12493
|
|
|
|
|
|
|
|
|
12494
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
12495
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
12496
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
12497
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
12498
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
12499
|
|
|
|
|
|
|
else { |
|
12500
|
0
|
|
|
|
|
|
switch ( _generic_tokenizer_cond_spaces[_generic_tokenizer_cond_offsets[cs] + ((_mid - _keys)>>1)] ) { |
|
12501
|
|
|
|
|
|
|
case 0: { |
|
12502
|
0
|
|
|
|
|
|
_widec = (short)(256u + (( ragel_char(chars[current])) - 0u)); |
|
12503
|
0
|
0
|
|
|
|
|
if ( |
|
12504
|
0
|
0
|
|
|
|
|
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
|
0
|
|
|
|
|
|
|
12505
|
|
|
|
|
|
|
break; |
|
12506
|
|
|
|
|
|
|
} |
|
12507
|
|
|
|
|
|
|
case 1: { |
|
12508
|
0
|
|
|
|
|
|
_widec = (short)(768u + (( ragel_char(chars[current])) - 0u)); |
|
12509
|
0
|
0
|
|
|
|
|
if ( |
|
12510
|
0
|
0
|
|
|
|
|
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12511
|
|
|
|
|
|
|
break; |
|
12512
|
|
|
|
|
|
|
} |
|
12513
|
|
|
|
|
|
|
} |
|
12514
|
|
|
|
|
|
|
break; |
|
12515
|
|
|
|
|
|
|
} |
|
12516
|
|
|
|
|
|
|
} |
|
12517
|
|
|
|
|
|
|
} |
|
12518
|
|
|
|
|
|
|
|
|
12519
|
0
|
|
|
|
|
|
_keys = _generic_tokenizer_trans_keys + _generic_tokenizer_key_offsets[cs]; |
|
12520
|
0
|
|
|
|
|
|
_trans = _generic_tokenizer_index_offsets[cs]; |
|
12521
|
|
|
|
|
|
|
|
|
12522
|
0
|
|
|
|
|
|
_klen = _generic_tokenizer_single_lengths[cs]; |
|
12523
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
12524
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
12525
|
|
|
|
|
|
|
const short *_mid; |
|
12526
|
0
|
|
|
|
|
|
const short *_upper = _keys + _klen - 1; |
|
12527
|
|
|
|
|
|
|
while (1) { |
|
12528
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
12529
|
|
|
|
|
|
|
break; |
|
12530
|
|
|
|
|
|
|
|
|
12531
|
0
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
12532
|
0
|
0
|
|
|
|
|
if ( _widec < *_mid ) |
|
12533
|
0
|
|
|
|
|
|
_upper = _mid - 1; |
|
12534
|
0
|
0
|
|
|
|
|
else if ( _widec > *_mid ) |
|
12535
|
0
|
|
|
|
|
|
_lower = _mid + 1; |
|
12536
|
|
|
|
|
|
|
else { |
|
12537
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
12538
|
0
|
|
|
|
|
|
goto _match; |
|
12539
|
|
|
|
|
|
|
} |
|
12540
|
|
|
|
|
|
|
} |
|
12541
|
0
|
|
|
|
|
|
_keys += _klen; |
|
12542
|
0
|
|
|
|
|
|
_trans += _klen; |
|
12543
|
|
|
|
|
|
|
} |
|
12544
|
|
|
|
|
|
|
|
|
12545
|
0
|
|
|
|
|
|
_klen = _generic_tokenizer_range_lengths[cs]; |
|
12546
|
0
|
0
|
|
|
|
|
if ( _klen > 0 ) { |
|
12547
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
12548
|
|
|
|
|
|
|
const short *_mid; |
|
12549
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
12550
|
|
|
|
|
|
|
while (1) { |
|
12551
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
12552
|
|
|
|
|
|
|
break; |
|
12553
|
|
|
|
|
|
|
|
|
12554
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
12555
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
12556
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
12557
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
12558
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
12559
|
|
|
|
|
|
|
else { |
|
12560
|
0
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
12561
|
0
|
|
|
|
|
|
goto _match; |
|
12562
|
|
|
|
|
|
|
} |
|
12563
|
|
|
|
|
|
|
} |
|
12564
|
0
|
|
|
|
|
|
_trans += _klen; |
|
12565
|
|
|
|
|
|
|
} |
|
12566
|
|
|
|
|
|
|
|
|
12567
|
|
|
|
|
|
|
_match: |
|
12568
|
0
|
|
|
|
|
|
_trans = _generic_tokenizer_indicies[_trans]; |
|
12569
|
|
|
|
|
|
|
_eof_trans: |
|
12570
|
0
|
|
|
|
|
|
cs = _generic_tokenizer_trans_targs[_trans]; |
|
12571
|
|
|
|
|
|
|
|
|
12572
|
0
|
0
|
|
|
|
|
if ( _generic_tokenizer_trans_actions[_trans] == 0 ) |
|
12573
|
|
|
|
|
|
|
goto _again; |
|
12574
|
|
|
|
|
|
|
|
|
12575
|
0
|
|
|
|
|
|
switch ( _generic_tokenizer_trans_actions[_trans] ) { |
|
12576
|
|
|
|
|
|
|
case 3: |
|
12577
|
0
|
|
|
|
|
|
{ whitespace = current; } |
|
12578
|
0
|
|
|
|
|
|
break; |
|
12579
|
|
|
|
|
|
|
case 4: |
|
12580
|
0
|
|
|
|
|
|
{te = ( current)+1;} |
|
12581
|
0
|
|
|
|
|
|
break; |
|
12582
|
|
|
|
|
|
|
case 7: |
|
12583
|
0
|
|
|
|
|
|
{te = ( current)+1;{ tokens.emplace_back(ts, te - ts); |
|
12584
|
0
|
|
|
|
|
|
current = te; |
|
12585
|
0
|
0
|
|
|
|
|
do |
|
12586
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12587
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12588
|
0
|
|
|
|
|
|
( current)--; |
|
12589
|
|
|
|
|
|
|
}} |
|
12590
|
0
|
|
|
|
|
|
break; |
|
12591
|
|
|
|
|
|
|
case 2: |
|
12592
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
|
12593
|
0
|
|
|
|
|
|
bool eos = is_eos(tokens, chars[ts].chr, nullptr); |
|
12594
|
0
|
0
|
|
|
|
|
for (current = ts; current < whitespace; current++) |
|
12595
|
0
|
|
|
|
|
|
tokens.emplace_back(current, 1); |
|
12596
|
0
|
|
|
|
|
|
{( current) = (( whitespace))-1;} |
|
12597
|
0
|
0
|
|
|
|
|
if (eos) {( current)++; goto _out; } |
|
12598
|
|
|
|
|
|
|
}} |
|
12599
|
|
|
|
|
|
|
break; |
|
12600
|
|
|
|
|
|
|
case 10: |
|
12601
|
0
|
|
|
|
|
|
{te = ( current)+1;{ |
|
12602
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
|
12603
|
0
|
|
|
|
|
|
current = te; |
|
12604
|
0
|
0
|
|
|
|
|
do |
|
12605
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12606
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12607
|
0
|
|
|
|
|
|
( current)--; |
|
12608
|
|
|
|
|
|
|
}} |
|
12609
|
0
|
|
|
|
|
|
break; |
|
12610
|
|
|
|
|
|
|
case 11: |
|
12611
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ tokens.emplace_back(ts, te - ts); |
|
12612
|
0
|
|
|
|
|
|
current = te; |
|
12613
|
0
|
0
|
|
|
|
|
do |
|
12614
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12615
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12616
|
0
|
|
|
|
|
|
( current)--; |
|
12617
|
|
|
|
|
|
|
}} |
|
12618
|
0
|
|
|
|
|
|
break; |
|
12619
|
|
|
|
|
|
|
case 8: |
|
12620
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
|
12621
|
0
|
|
|
|
|
|
current = te; |
|
12622
|
0
|
0
|
|
|
|
|
do |
|
12623
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12624
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12625
|
0
|
|
|
|
|
|
( current)--; |
|
12626
|
|
|
|
|
|
|
}} |
|
12627
|
0
|
|
|
|
|
|
break; |
|
12628
|
|
|
|
|
|
|
case 9: |
|
12629
|
0
|
|
|
|
|
|
{te = ( current);( current)--;{ |
|
12630
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) {( current)++; goto _out; } |
|
12631
|
0
|
|
|
|
|
|
current = te; |
|
12632
|
0
|
0
|
|
|
|
|
do |
|
12633
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12634
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12635
|
0
|
|
|
|
|
|
( current)--; |
|
12636
|
|
|
|
|
|
|
}} |
|
12637
|
0
|
|
|
|
|
|
break; |
|
12638
|
|
|
|
|
|
|
case 1: |
|
12639
|
0
|
|
|
|
|
|
{{( current) = ((te))-1;}{ tokens.emplace_back(ts, te - ts); |
|
12640
|
0
|
|
|
|
|
|
current = te; |
|
12641
|
0
|
0
|
|
|
|
|
do |
|
12642
|
0
|
0
|
|
|
|
|
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12643
|
|
|
|
|
|
|
while (tokenize_url_email(tokens)); |
|
12644
|
0
|
|
|
|
|
|
( current)--; |
|
12645
|
|
|
|
|
|
|
}} |
|
12646
|
0
|
|
|
|
|
|
break; |
|
12647
|
|
|
|
|
|
|
} |
|
12648
|
|
|
|
|
|
|
|
|
12649
|
|
|
|
|
|
|
_again: |
|
12650
|
0
|
0
|
|
|
|
|
switch ( _generic_tokenizer_to_state_actions[cs] ) { |
|
12651
|
|
|
|
|
|
|
case 5: |
|
12652
|
0
|
|
|
|
|
|
{ts = 0;} |
|
12653
|
0
|
|
|
|
|
|
break; |
|
12654
|
|
|
|
|
|
|
} |
|
12655
|
|
|
|
|
|
|
|
|
12656
|
0
|
0
|
|
|
|
|
if ( cs == 0 ) |
|
12657
|
|
|
|
|
|
|
goto _out; |
|
12658
|
0
|
0
|
|
|
|
|
if ( ++( current) != ( (chars.size() - 1)) ) |
|
12659
|
|
|
|
|
|
|
goto _resume; |
|
12660
|
|
|
|
|
|
|
_test_eof: {} |
|
12661
|
0
|
0
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
|
12662
|
|
|
|
|
|
|
{ |
|
12663
|
0
|
0
|
|
|
|
|
if ( _generic_tokenizer_eof_trans[cs] > 0 ) { |
|
12664
|
0
|
|
|
|
|
|
_trans = _generic_tokenizer_eof_trans[cs] - 1; |
|
12665
|
0
|
|
|
|
|
|
goto _eof_trans; |
|
12666
|
|
|
|
|
|
|
} |
|
12667
|
|
|
|
|
|
|
} |
|
12668
|
|
|
|
|
|
|
|
|
12669
|
|
|
|
|
|
|
_out: {} |
|
12670
|
|
|
|
|
|
|
} |
|
12671
|
|
|
|
|
|
|
|
|
12672
|
|
|
|
|
|
|
(void)act; // Suppress unused variable warning |
|
12673
|
|
|
|
|
|
|
|
|
12674
|
0
|
|
|
|
|
|
return !tokens.empty(); |
|
12675
|
|
|
|
|
|
|
} |
|
12676
|
|
|
|
|
|
|
|
|
12677
|
|
|
|
|
|
|
} // namespace morphodita |
|
12678
|
|
|
|
|
|
|
|
|
12679
|
|
|
|
|
|
|
///////// |
|
12680
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory.h |
|
12681
|
|
|
|
|
|
|
///////// |
|
12682
|
|
|
|
|
|
|
|
|
12683
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
12684
|
|
|
|
|
|
|
// |
|
12685
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
12686
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
12687
|
|
|
|
|
|
|
// |
|
12688
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
12689
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
12690
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
12691
|
|
|
|
|
|
|
|
|
12692
|
|
|
|
|
|
|
namespace morphodita { |
|
12693
|
|
|
|
|
|
|
|
|
12694
|
0
|
|
|
|
|
|
class generic_tokenizer_factory : public tokenizer_factory { |
|
12695
|
|
|
|
|
|
|
public: |
|
12696
|
|
|
|
|
|
|
// Construct a new tokenizer instance. |
|
12697
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer(const morpho* m) const override; |
|
12698
|
|
|
|
|
|
|
|
|
12699
|
|
|
|
|
|
|
bool load(istream& is); |
|
12700
|
|
|
|
|
|
|
private: |
|
12701
|
|
|
|
|
|
|
unsigned version; |
|
12702
|
|
|
|
|
|
|
}; |
|
12703
|
|
|
|
|
|
|
|
|
12704
|
|
|
|
|
|
|
} // namespace morphodita |
|
12705
|
|
|
|
|
|
|
|
|
12706
|
|
|
|
|
|
|
///////// |
|
12707
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory.cpp |
|
12708
|
|
|
|
|
|
|
///////// |
|
12709
|
|
|
|
|
|
|
|
|
12710
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
12711
|
|
|
|
|
|
|
// |
|
12712
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
12713
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
12714
|
|
|
|
|
|
|
// |
|
12715
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
12716
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
12717
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
12718
|
|
|
|
|
|
|
|
|
12719
|
|
|
|
|
|
|
namespace morphodita { |
|
12720
|
|
|
|
|
|
|
|
|
12721
|
0
|
|
|
|
|
|
// Allocate a generic_tokenizer of the stored version with new and return it.
// The morpho argument is not used by the generic tokenizer.
tokenizer* generic_tokenizer_factory::new_tokenizer(const morpho* /*m*/) const {
  tokenizer* const created = new generic_tokenizer(version);
  return created;
}
|
12724
|
|
|
|
|
|
|
|
|
12725
|
0
|
|
|
|
|
|
// Load the tokenizer version, stored as a single byte, from the stream.
// Returns false when the byte could not be read (the stream is then in
// a failed state and the stored version must not be relied upon).
bool generic_tokenizer_factory::load(istream& is) {
  const auto byte = is.get();
  version = byte;

  return static_cast<bool>(is);
}
|
12730
|
|
|
|
|
|
|
|
|
12731
|
|
|
|
|
|
|
} // namespace morphodita |
|
12732
|
|
|
|
|
|
|
|
|
12733
|
|
|
|
|
|
|
///////// |
|
12734
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory_encoder.h |
|
12735
|
|
|
|
|
|
|
///////// |
|
12736
|
|
|
|
|
|
|
|
|
12737
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
12738
|
|
|
|
|
|
|
// |
|
12739
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
12740
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
12741
|
|
|
|
|
|
|
// |
|
12742
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
12743
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
12744
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
12745
|
|
|
|
|
|
|
|
|
12746
|
|
|
|
|
|
|
namespace morphodita { |
|
12747
|
|
|
|
|
|
|
|
|
12748
|
|
|
|
|
|
|
class generic_tokenizer_factory_encoder { |
|
12749
|
|
|
|
|
|
|
public: |
|
12750
|
|
|
|
|
|
|
static void encode(unsigned version, ostream& os); |
|
12751
|
|
|
|
|
|
|
}; |
|
12752
|
|
|
|
|
|
|
|
|
12753
|
|
|
|
|
|
|
} // namespace morphodita |
|
12754
|
|
|
|
|
|
|
|
|
12755
|
|
|
|
|
|
|
///////// |
|
12756
|
|
|
|
|
|
|
// File: morphodita/tokenizer/generic_tokenizer_factory_encoder.cpp |
|
12757
|
|
|
|
|
|
|
///////// |
|
12758
|
|
|
|
|
|
|
|
|
12759
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
12760
|
|
|
|
|
|
|
// |
|
12761
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
12762
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
12763
|
|
|
|
|
|
|
// |
|
12764
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
12765
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
12766
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
12767
|
|
|
|
|
|
|
|
|
12768
|
|
|
|
|
|
|
namespace morphodita { |
|
12769
|
|
|
|
|
|
|
|
|
12770
|
0
|
|
|
|
|
|
// Store the version as a single byte; only the low-order char-sized part of
// the value is written.
void generic_tokenizer_factory_encoder::encode(unsigned version, ostream& os) {
  const char encoded = static_cast<char>(version);
  os.put(encoded);
}
|
12773
|
|
|
|
|
|
|
|
|
12774
|
|
|
|
|
|
|
} // namespace morphodita |
|
12775
|
|
|
|
|
|
|
|
|
12776
|
|
|
|
|
|
|
///////// |
|
12777
|
|
|
|
|
|
|
// File: unilib/uninorms.h |
|
12778
|
|
|
|
|
|
|
///////// |
|
12779
|
|
|
|
|
|
|
|
|
12780
|
|
|
|
|
|
|
// This file is part of UniLib . |
|
12781
|
|
|
|
|
|
|
// |
|
12782
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
12783
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
12784
|
|
|
|
|
|
|
// |
|
12785
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
12786
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
12787
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
12788
|
|
|
|
|
|
|
// |
|
12789
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
12790
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
12791
|
|
|
|
|
|
|
|
|
12792
|
|
|
|
|
|
|
namespace unilib { |
|
12793
|
|
|
|
|
|
|
|
|
12794
|
|
|
|
|
|
|
// Unicode normalization of UTF-32 strings. All four public methods
// normalize the given string in place.
class uninorms {
 public:
  static void nfc(std::u32string& str);
  static void nfd(std::u32string& str);
  static void nfkc(std::u32string& str);
  static void nfkd(std::u32string& str);

 private:
  // Building blocks of the public normalization forms.
  static void compose(std::u32string& str);
  static void decompose(std::u32string& str, bool kanonical);

  // Number of Unicode code points; the *_index tables below have one entry
  // per block of 256 code points.
  static const char32_t CHARS = 0x110000;

  struct Hangul {
    // Hangul decomposition and composition
    static const char32_t SBase = 0xAC00, LBase = 0x1100, VBase = 0x1161, TBase = 0x11A7;
    static const char32_t LCount = 19, VCount = 21, TCount = 28, NCount = VCount * TCount, SCount = LCount * NCount;
  };

  // Two-level lookup tables, defined elsewhere: an index with one entry per
  // 256 code points and the corresponding blocks.
  static const uint8_t ccc_index[CHARS >> 8];
  static const uint8_t ccc_block[][256];

  static const uint8_t composition_index[CHARS >> 8];
  static const uint16_t composition_block[][257];
  static const char32_t composition_data[];

  static const uint8_t decomposition_index[CHARS >> 8];
  static const uint16_t decomposition_block[][257];
  static const char32_t decomposition_data[];
};
|
12824
|
|
|
|
|
|
|
|
|
12825
|
|
|
|
|
|
|
} // namespace unilib |
|
12826
|
|
|
|
|
|
|
|
|
12827
|
|
|
|
|
|
|
///////// |
|
12828
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_network.h |
|
12829
|
|
|
|
|
|
|
///////// |
|
12830
|
|
|
|
|
|
|
|
|
12831
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
12832
|
|
|
|
|
|
|
// |
|
12833
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
12834
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
12835
|
|
|
|
|
|
|
// |
|
12836
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
12837
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
12838
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
12839
|
|
|
|
|
|
|
|
|
12840
|
|
|
|
|
|
|
namespace morphodita { |
|
12841
|
|
|
|
|
|
|
|
|
12842
|
|
|
|
|
|
|
// Declarations |
|
12843
|
|
|
|
|
|
|
|
|
12844
|
1
|
|
|
|
|
|
class gru_tokenizer_network { |
|
12845
|
|
|
|
|
|
|
public: |
|
12846
|
1
|
|
|
|
|
|
virtual ~gru_tokenizer_network() {} |
|
12847
|
|
|
|
|
|
|
|
|
12848
|
|
|
|
|
|
|
template struct matrix { |
|
12849
|
|
|
|
|
|
|
float w[R][C]; |
|
12850
|
|
|
|
|
|
|
float b[R]; |
|
12851
|
|
|
|
|
|
|
|
|
12852
|
|
|
|
|
|
|
void clear(); |
|
12853
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
12854
|
|
|
|
|
|
|
}; |
|
12855
|
|
|
|
|
|
|
|
|
12856
|
|
|
|
|
|
|
enum { NO_SPLIT, END_OF_TOKEN, END_OF_SENTENCE, OUTCOMES }; |
|
12857
|
|
|
|
|
|
|
struct outcome_t { |
|
12858
|
|
|
|
|
|
|
int outcome; |
|
12859
|
|
|
|
|
|
|
float w[3]; |
|
12860
|
|
|
|
|
|
|
const float* embedding; |
|
12861
|
|
|
|
|
|
|
}; |
|
12862
|
|
|
|
|
|
|
struct char_info { |
|
12863
|
|
|
|
|
|
|
char32_t chr; |
|
12864
|
|
|
|
|
|
|
unilib::unicode::category_t cat; |
|
12865
|
|
|
|
|
|
|
|
|
12866
|
|
|
|
|
|
|
char_info() {} |
|
12867
|
34
|
|
|
|
|
|
char_info(char32_t chr, unilib::unicode::category_t cat) : chr(chr), cat(cat) {} |
|
12868
|
|
|
|
|
|
|
}; |
|
12869
|
|
|
|
|
|
|
|
|
12870
|
|
|
|
|
|
|
virtual void classify(const vector& chars, vector& outcomes) const = 0; |
|
12871
|
|
|
|
|
|
|
|
|
12872
|
|
|
|
|
|
|
static gru_tokenizer_network* load(binary_decoder& data); |
|
12873
|
|
|
|
|
|
|
}; |
|
12874
|
|
|
|
|
|
|
|
|
12875
|
|
|
|
|
|
|
template |
|
12876
|
2
|
|
|
|
|
|
class gru_tokenizer_network_implementation : public gru_tokenizer_network { |
|
12877
|
|
|
|
|
|
|
public: |
|
12878
|
|
|
|
|
|
|
virtual void classify(const vector& chars, vector& outcomes) const override; |
|
12879
|
|
|
|
|
|
|
|
|
12880
|
|
|
|
|
|
|
static gru_tokenizer_network_implementation* load(binary_decoder& data); |
|
12881
|
|
|
|
|
|
|
|
|
12882
|
|
|
|
|
|
|
protected: |
|
12883
|
|
|
|
|
|
|
void cache_embeddings(); |
|
12884
|
|
|
|
|
|
|
|
|
12885
|
|
|
|
|
|
|
struct cached_embedding { |
|
12886
|
|
|
|
|
|
|
matrix<1, D> e; |
|
12887
|
|
|
|
|
|
|
matrix<6, D> cache; |
|
12888
|
|
|
|
|
|
|
}; |
|
12889
|
|
|
|
|
|
|
|
|
12890
|
|
|
|
|
|
|
struct gru { |
|
12891
|
|
|
|
|
|
|
matrix X, X_r, X_z; |
|
12892
|
|
|
|
|
|
|
matrix H, H_r, H_z; |
|
12893
|
|
|
|
|
|
|
|
|
12894
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
12895
|
|
|
|
|
|
|
}; |
|
12896
|
|
|
|
|
|
|
|
|
12897
|
|
|
|
|
|
|
unordered_map embeddings; |
|
12898
|
|
|
|
|
|
|
cached_embedding empty_embedding; |
|
12899
|
|
|
|
|
|
|
gru gru_fwd, gru_bwd; |
|
12900
|
|
|
|
|
|
|
matrix<3, D> projection_fwd, projection_bwd; |
|
12901
|
|
|
|
|
|
|
unordered_map unknown_chars; |
|
12902
|
|
|
|
|
|
|
}; |
|
12903
|
|
|
|
|
|
|
|
|
12904
|
|
|
|
|
|
|
// Definitions |
|
12905
|
|
|
|
|
|
|
|
|
12906
|
|
|
|
|
|
|
template |
|
12907
|
|
|
|
|
|
|
void gru_tokenizer_network::matrix::clear() { |
|
12908
|
4
|
100
|
|
|
|
|
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12909
|
0
|
|
|
|
|
|
fill_n(b, R, 0.f); |
|
12910
|
|
|
|
|
|
|
} |
|
12911
|
|
|
|
|
|
|
|
|
12912
|
|
|
|
|
|
|
template |
|
12913
|
28
|
|
|
|
|
|
void gru_tokenizer_network::matrix::load(binary_decoder& data) { |
|
12914
|
212
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
12915
|
14
|
|
|
|
|
|
memcpy(b, data.next(R), sizeof(float) * R); |
|
12916
|
14
|
|
|
|
|
|
} |
|
12917
|
|
|
|
|
|
|
|
|
12918
|
|
|
|
|
|
|
template |
|
12919
|
2
|
|
|
|
|
|
void gru_tokenizer_network_implementation::gru::load(binary_decoder& data) { |
|
12920
|
2
|
|
|
|
|
|
X.load(data); |
|
12921
|
2
|
|
|
|
|
|
X_r.load(data); |
|
12922
|
2
|
|
|
|
|
|
X_z.load(data); |
|
12923
|
2
|
|
|
|
|
|
H.load(data); |
|
12924
|
2
|
|
|
|
|
|
H_r.load(data); |
|
12925
|
2
|
|
|
|
|
|
H_z.load(data); |
|
12926
|
2
|
|
|
|
|
|
} |
|
12927
|
|
|
|
|
|
|
|
|
12928
|
|
|
|
|
|
|
template |
|
12929
|
1
|
|
|
|
|
|
void gru_tokenizer_network_implementation::classify(const vector& chars, vector& outcomes) const { |
|
12930
|
2
|
50
|
|
|
|
|
if (chars.empty()) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12931
|
|
|
|
|
|
|
|
|
12932
|
|
|
|
|
|
|
// Resolve embeddings, possibly with unknown_chars or empty_embedding |
|
12933
|
|
|
|
|
|
|
u32string decomposition; |
|
12934
|
35
|
100
|
|
|
|
|
for (size_t i = 0; i < chars.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12935
|
34
|
|
|
|
|
|
auto embedding = embeddings.find(chars[i].chr); |
|
12936
|
|
|
|
|
|
|
|
|
12937
|
|
|
|
|
|
|
// Try finding substitute character if not found, by using NFKD |
|
12938
|
|
|
|
|
|
|
// and by replacing IDEOGRAPHIC FULL STOP/COMMA. |
|
12939
|
34
|
|
|
|
|
|
if (embedding == embeddings.end()) { |
|
12940
|
0
|
|
|
|
|
|
decomposition.assign(1, chars[i].chr); |
|
12941
|
|
|
|
|
|
|
unilib::uninorms::nfkd(decomposition); |
|
12942
|
0
|
0
|
|
|
|
|
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12943
|
0
|
0
|
|
|
|
|
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12944
|
0
|
0
|
|
|
|
|
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12945
|
|
|
|
|
|
|
} |
|
12946
|
|
|
|
|
|
|
|
|
12947
|
34
|
50
|
|
|
|
|
if (embedding != embeddings.end()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12948
|
34
|
|
|
|
|
|
outcomes[i].embedding = embedding->second.cache.w[0]; |
|
12949
|
|
|
|
|
|
|
} else { |
|
12950
|
0
|
|
|
|
|
|
auto unknown_char = unknown_chars.find(chars[i].cat); |
|
12951
|
0
|
|
|
|
|
|
if (unknown_char != unknown_chars.end()) embedding = embeddings.find(unknown_char->second); |
|
12952
|
0
|
0
|
|
|
|
|
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12953
|
|
|
|
|
|
|
} |
|
12954
|
|
|
|
|
|
|
} |
|
12955
|
|
|
|
|
|
|
|
|
12956
|
|
|
|
|
|
|
// Clear outcome probabilities |
|
12957
|
35
|
100
|
|
|
|
|
for (auto&& outcome : outcomes) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12958
|
136
|
100
|
|
|
|
|
for (int i = 0; i < 3; i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12959
|
102
|
|
|
|
|
|
outcome.w[i] = projection_fwd.b[i]; |
|
12960
|
|
|
|
|
|
|
|
|
12961
|
|
|
|
|
|
|
// Perform forward & backward GRU |
|
12962
|
|
|
|
|
|
|
matrix<1, D> state, update, reset, candidate; |
|
12963
|
3
|
100
|
|
|
|
|
for (int dir = 0; dir < 2; dir++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12964
|
2
|
100
|
|
|
|
|
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12965
|
2
|
100
|
|
|
|
|
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12966
|
|
|
|
|
|
|
|
|
12967
|
|
|
|
|
|
|
state.clear(); |
|
12968
|
70
|
100
|
|
|
|
|
for (size_t i = 0; i < outcomes.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12969
|
68
|
100
|
|
|
|
|
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12970
|
68
|
100
|
|
|
|
|
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12971
|
|
|
|
|
|
|
|
|
12972
|
1156
|
100
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12973
|
1088
|
|
|
|
|
|
update.w[0][j] = gru.X_z.b[j] + embedding_cache[2*D + j]; |
|
12974
|
1088
|
|
|
|
|
|
reset.w[0][j] = gru.X_r.b[j] + embedding_cache[D + j]; |
|
12975
|
18496
|
100
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12976
|
17408
|
|
|
|
|
|
update.w[0][j] += state.w[0][k] * gru.H_z.w[j][k]; |
|
12977
|
17408
|
|
|
|
|
|
reset.w[0][j] += state.w[0][k] * gru.H_r.w[j][k]; |
|
12978
|
|
|
|
|
|
|
} |
|
12979
|
2176
|
|
|
|
|
|
update.w[0][j] = 1.f / (1.f + exp(-update.w[0][j])); |
|
12980
|
2176
|
|
|
|
|
|
reset.w[0][j] = 1.f / (1.f + exp(-reset.w[0][j])); |
|
12981
|
1088
|
|
|
|
|
|
reset.w[0][j] *= state.w[0][j]; |
|
12982
|
|
|
|
|
|
|
} |
|
12983
|
1156
|
100
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12984
|
1088
|
|
|
|
|
|
candidate.w[0][j] = gru.X.b[j] + embedding_cache[j]; |
|
12985
|
18496
|
100
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12986
|
17408
|
|
|
|
|
|
candidate.w[0][j] += reset.w[0][k] * gru.H.w[j][k]; |
|
12987
|
1088
|
|
|
|
|
|
candidate.w[0][j] = tanh(candidate.w[0][j]); |
|
12988
|
1088
|
|
|
|
|
|
state.w[0][j] = update.w[0][j] * state.w[0][j] + (1.f - update.w[0][j]) * candidate.w[0][j]; |
|
12989
|
|
|
|
|
|
|
} |
|
12990
|
|
|
|
|
|
|
|
|
12991
|
272
|
100
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12992
|
3468
|
100
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12993
|
3264
|
|
|
|
|
|
outcome.w[j] += projection.w[j][k] * state.w[0][k]; |
|
12994
|
|
|
|
|
|
|
} |
|
12995
|
|
|
|
|
|
|
} |
|
12996
|
|
|
|
|
|
|
|
|
12997
|
|
|
|
|
|
|
// Choose the outcome with the highest weight |
|
12998
|
35
|
100
|
|
|
|
|
for (auto&& outcome : outcomes) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
12999
|
34
|
|
|
|
|
|
outcome.outcome = outcome.w[1] > outcome.w[0]; |
|
13000
|
34
|
100
|
|
|
|
|
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13001
|
|
|
|
|
|
|
} |
|
13002
|
|
|
|
|
|
|
} |
|
13003
|
|
|
|
|
|
|
|
|
13004
|
|
|
|
|
|
|
template |
|
13005
|
1
|
|
|
|
|
|
gru_tokenizer_network_implementation* gru_tokenizer_network_implementation::load(binary_decoder& data) { |
|
13006
|
1
|
|
|
|
|
|
unique_ptr> network(new gru_tokenizer_network_implementation()); |
|
13007
|
|
|
|
|
|
|
|
|
13008
|
21
|
0
|
|
|
|
|
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13009
|
20
|
0
|
|
|
|
|
auto& embedding = network->embeddings[data.next_4B()]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13010
|
20
|
0
|
|
|
|
|
copy_n(data.next(D), D, embedding.e.w[0]); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13011
|
|
|
|
|
|
|
} |
|
13012
|
1
|
|
|
|
|
|
fill_n(network->empty_embedding.e.w[0], D, 0.f); |
|
13013
|
|
|
|
|
|
|
|
|
13014
|
1
|
0
|
|
|
|
|
network->gru_fwd.load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13015
|
1
|
0
|
|
|
|
|
network->gru_bwd.load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13016
|
1
|
0
|
|
|
|
|
network->projection_fwd.load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13017
|
1
|
0
|
|
|
|
|
network->projection_bwd.load(data); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13018
|
|
|
|
|
|
|
|
|
13019
|
|
|
|
|
|
|
network->unknown_chars.clear(); |
|
13020
|
5
|
0
|
|
|
|
|
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13021
|
4
|
0
|
|
|
|
|
unilib::unicode::category_t cat = data.next_4B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13022
|
4
|
0
|
|
|
|
|
network->unknown_chars[cat] = data.next_4B(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13023
|
|
|
|
|
|
|
} |
|
13024
|
|
|
|
|
|
|
|
|
13025
|
1
|
|
|
|
|
|
network->cache_embeddings(); |
|
13026
|
|
|
|
|
|
|
|
|
13027
|
1
|
|
|
|
|
|
return network.release(); |
|
13028
|
|
|
|
|
|
|
} |
|
13029
|
|
|
|
|
|
|
|
|
13030
|
|
|
|
|
|
|
template |
|
13031
|
2
|
|
|
|
|
|
void gru_tokenizer_network_implementation::cache_embeddings() { |
|
13032
|
21
|
0
|
|
|
|
|
for (auto&& embedding : embeddings) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13033
|
|
|
|
|
|
|
auto& e = embedding.second.e; |
|
13034
|
|
|
|
|
|
|
auto& cache = embedding.second.cache; |
|
13035
|
|
|
|
|
|
|
|
|
13036
|
140
|
0
|
|
|
|
|
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13037
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13038
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13039
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13040
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13041
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13042
|
5460
|
0
|
|
|
|
|
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13043
|
|
|
|
|
|
|
} |
|
13044
|
7
|
0
|
|
|
|
|
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13045
|
1
|
|
|
|
|
|
} |
|
13046
|
|
|
|
|
|
|
|
|
13047
|
|
|
|
|
|
|
} // namespace morphodita |
|
13048
|
|
|
|
|
|
|
|
|
13049
|
|
|
|
|
|
|
///////// |
|
13050
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer.h |
|
13051
|
|
|
|
|
|
|
///////// |
|
13052
|
|
|
|
|
|
|
|
|
13053
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
13054
|
|
|
|
|
|
|
// |
|
13055
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13056
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13057
|
|
|
|
|
|
|
// |
|
13058
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13059
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13060
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13061
|
|
|
|
|
|
|
|
|
13062
|
|
|
|
|
|
|
namespace morphodita { |
|
13063
|
|
|
|
|
|
|
|
|
13064
|
4
|
|
|
|
|
|
class gru_tokenizer : public unicode_tokenizer { |
|
13065
|
|
|
|
|
|
|
public: |
|
13066
|
|
|
|
|
|
|
gru_tokenizer(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const gru_tokenizer_network& network) |
|
13067
|
1
|
0
|
|
|
|
|
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13068
|
|
|
|
|
|
|
|
|
13069
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
|
13070
|
|
|
|
|
|
|
|
|
13071
|
|
|
|
|
|
|
private: |
|
13072
|
|
|
|
|
|
|
inline bool is_space(size_t index); |
|
13073
|
|
|
|
|
|
|
int next_outcome(); |
|
13074
|
|
|
|
|
|
|
|
|
13075
|
|
|
|
|
|
|
unsigned segment; |
|
13076
|
|
|
|
|
|
|
bool allow_spaces; |
|
13077
|
|
|
|
|
|
|
unsigned network_index, network_length; |
|
13078
|
|
|
|
|
|
|
vector network_chars; |
|
13079
|
|
|
|
|
|
|
vector network_outcomes; |
|
13080
|
|
|
|
|
|
|
vector network_offsets; |
|
13081
|
|
|
|
|
|
|
const gru_tokenizer_network& network; |
|
13082
|
|
|
|
|
|
|
}; |
|
13083
|
|
|
|
|
|
|
|
|
13084
|
|
|
|
|
|
|
} // namespace morphodita |
|
13085
|
|
|
|
|
|
|
|
|
13086
|
|
|
|
|
|
|
///////// |
|
13087
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer.cpp |
|
13088
|
|
|
|
|
|
|
///////// |
|
13089
|
|
|
|
|
|
|
|
|
13090
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
13091
|
|
|
|
|
|
|
// |
|
13092
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13093
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13094
|
|
|
|
|
|
|
// |
|
13095
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13096
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13097
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13098
|
|
|
|
|
|
|
|
|
13099
|
|
|
|
|
|
|
namespace morphodita { |
|
13100
|
|
|
|
|
|
|
|
|
13101
|
|
|
|
|
|
|
bool gru_tokenizer::is_space(size_t index) { |
|
13102
|
83
|
100
|
|
|
|
|
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13103
|
|
|
|
|
|
|
} |
|
13104
|
|
|
|
|
|
|
|
|
13105
|
2
|
|
|
|
|
|
bool gru_tokenizer::next_sentence(vector& tokens) { |
|
13106
|
|
|
|
|
|
|
tokens.clear(); |
|
13107
|
|
|
|
|
|
|
|
|
13108
|
|
|
|
|
|
|
// Reset tokenizer on new text |
|
13109
|
9
|
100
|
|
|
|
|
if (current == 0) network_index = network_length = 0; |
|
13110
|
|
|
|
|
|
|
|
|
13111
|
|
|
|
|
|
|
// Tokenize until EOS |
|
13112
|
9
|
100
|
|
|
|
|
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13113
|
25
|
100
|
|
|
|
|
while (current < chars.size() - 1 && is_space(current)) |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
13114
|
5
|
50
|
|
|
|
|
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13115
|
|
|
|
|
|
|
break; |
|
13116
|
|
|
|
|
|
|
|
|
13117
|
8
|
100
|
|
|
|
|
if (current >= chars.size() - 1) break; |
|
13118
|
|
|
|
|
|
|
|
|
13119
|
|
|
|
|
|
|
// We have a beginning of a token. Try if it is an URL. |
|
13120
|
7
|
50
|
|
|
|
|
if (tokenize_url_email(tokens)) { |
|
13121
|
0
|
0
|
|
|
|
|
while (network_index < network_length && network_offsets[network_index] < current) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13122
|
0
|
0
|
|
|
|
|
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13123
|
|
|
|
|
|
|
eos = true; |
|
13124
|
0
|
|
|
|
|
|
continue; |
|
13125
|
|
|
|
|
|
|
} |
|
13126
|
|
|
|
|
|
|
|
|
13127
|
|
|
|
|
|
|
// Slurp current token |
|
13128
|
7
|
|
|
|
|
|
size_t token_start = current; |
|
13129
|
22
|
50
|
|
|
|
|
do { |
|
13130
|
29
|
|
|
|
|
|
int outcome = next_outcome(); |
|
13131
|
29
|
|
|
|
|
|
eos = outcome == gru_tokenizer_network::END_OF_SENTENCE; |
|
13132
|
29
|
100
|
|
|
|
|
if (outcome != gru_tokenizer_network::NO_SPLIT) break; |
|
13133
|
44
|
|
|
|
|
|
} while (current < chars.size() - 1); |
|
13134
|
8
|
|
|
|
|
|
tokens.emplace_back(token_start, current - token_start); |
|
13135
|
|
|
|
|
|
|
} |
|
13136
|
|
|
|
|
|
|
|
|
13137
|
2
|
|
|
|
|
|
return !tokens.empty(); |
|
13138
|
|
|
|
|
|
|
} |
|
13139
|
|
|
|
|
|
|
|
|
13140
|
34
|
|
|
|
|
|
int gru_tokenizer::next_outcome() { |
|
13141
|
34
|
100
|
|
|
|
|
if (network_index >= network_length) { |
|
13142
|
|
|
|
|
|
|
// Compute required window |
|
13143
|
1
|
|
|
|
|
|
network_index = 0; |
|
13144
|
1
|
|
|
|
|
|
network_length = 0; |
|
13145
|
|
|
|
|
|
|
network_chars.clear(); |
|
13146
|
|
|
|
|
|
|
network_outcomes.clear(); |
|
13147
|
|
|
|
|
|
|
network_offsets.clear(); |
|
13148
|
|
|
|
|
|
|
|
|
13149
|
|
|
|
|
|
|
// Prepare data for the classification |
|
13150
|
70
|
100
|
|
|
|
|
for (size_t offset = current; |
|
13151
|
35
|
100
|
|
|
|
|
network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment; |
|
|
|
50
|
|
|
|
|
|
|
13152
|
34
|
|
|
|
|
|
network_length++, offset++) { |
|
13153
|
34
|
100
|
|
|
|
|
if (is_space(offset)) { |
|
13154
|
5
|
|
|
|
|
|
network_chars.emplace_back(' ', unilib::unicode::Zs); |
|
13155
|
9
|
100
|
|
|
|
|
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13156
|
|
|
|
|
|
|
} else { |
|
13157
|
29
|
|
|
|
|
|
network_chars.emplace_back(chars[offset].chr, chars[offset].cat); |
|
13158
|
|
|
|
|
|
|
} |
|
13159
|
|
|
|
|
|
|
} |
|
13160
|
|
|
|
|
|
|
// Add a space to the end on the EOD |
|
13161
|
1
|
50
|
|
|
|
|
if (network_length < segment && network_chars.back().chr != ' ') |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13162
|
0
|
|
|
|
|
|
network_chars.emplace_back(' ', unilib::unicode::Zs); |
|
13163
|
1
|
|
|
|
|
|
network_outcomes.resize(network_chars.size()); |
|
13164
|
|
|
|
|
|
|
|
|
13165
|
|
|
|
|
|
|
// Perform the classification |
|
13166
|
34
|
|
|
|
|
|
network.classify(network_chars, network_outcomes); |
|
13167
|
|
|
|
|
|
|
|
|
13168
|
|
|
|
|
|
|
// Add spacing token/sentence breaks |
|
13169
|
34
|
100
|
|
|
|
|
for (size_t i = 0; i < network_length - 1; i++) |
|
13170
|
33
|
100
|
|
|
|
|
if (is_space(network_offsets[i+1])) { |
|
13171
|
|
|
|
|
|
|
// Detect EOS on the following space or \n\n or \r\n\r\n, or if there is end of text |
|
13172
|
5
|
|
|
|
|
|
bool eos = network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_SENTENCE; |
|
13173
|
5
|
100
|
|
|
|
|
if (i + 2 == network_length) eos = true; |
|
13174
|
5
|
50
|
|
|
|
|
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13175
|
0
|
0
|
|
|
|
|
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13176
|
0
|
0
|
|
|
|
|
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13177
|
5
|
100
|
|
|
|
|
if (eos) network_outcomes[i].outcome = gru_tokenizer_network::END_OF_SENTENCE; |
|
13178
|
|
|
|
|
|
|
|
|
13179
|
5
|
100
|
|
|
|
|
if (network_outcomes[i].outcome == gru_tokenizer_network::NO_SPLIT) |
|
13180
|
|
|
|
|
|
|
// Force EOT if not allowing spaces, and also detect EOT on the following space |
|
13181
|
4
|
50
|
|
|
|
|
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
13182
|
4
|
|
|
|
|
|
network_outcomes[i].outcome = gru_tokenizer_network::END_OF_TOKEN; |
|
13183
|
|
|
|
|
|
|
} |
|
13184
|
|
|
|
|
|
|
|
|
13185
|
|
|
|
|
|
|
// Adjust network_length to suitable break |
|
13186
|
1
|
50
|
|
|
|
|
if (network_length == segment && network_length >= 10) { |
|
|
|
0
|
|
|
|
|
|
|
13187
|
0
|
|
|
|
|
|
network_length -= 5; |
|
13188
|
0
|
0
|
|
|
|
|
while (network_length > segment / 2) |
|
13189
|
0
|
0
|
|
|
|
|
if (network_outcomes[--network_length].outcome != gru_tokenizer_network::NO_SPLIT) |
|
13190
|
|
|
|
|
|
|
break; |
|
13191
|
|
|
|
|
|
|
} |
|
13192
|
|
|
|
|
|
|
} |
|
13193
|
102
|
|
|
|
|
|
return current = network_offsets[network_index + 1], network_outcomes[network_index++].outcome; |
|
13194
|
|
|
|
|
|
|
} |
|
13195
|
|
|
|
|
|
|
|
|
13196
|
|
|
|
|
|
|
} // namespace morphodita |
|
13197
|
|
|
|
|
|
|
|
|
13198
|
|
|
|
|
|
|
///////// |
|
13199
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_factory.h |
|
13200
|
|
|
|
|
|
|
///////// |
|
13201
|
|
|
|
|
|
|
|
|
13202
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
13203
|
|
|
|
|
|
|
// |
|
13204
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13205
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13206
|
|
|
|
|
|
|
// |
|
13207
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13208
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13209
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13210
|
|
|
|
|
|
|
|
|
13211
|
|
|
|
|
|
|
namespace morphodita { |
|
13212
|
|
|
|
|
|
|
|
|
13213
|
2
|
|
|
|
|
|
class gru_tokenizer_factory : public tokenizer_factory { |
|
13214
|
|
|
|
|
|
|
public: |
|
13215
|
|
|
|
|
|
|
// Construct a new tokenizer instance. |
|
13216
|
|
|
|
|
|
|
virtual tokenizer* new_tokenizer(const morpho* m) const override; |
|
13217
|
|
|
|
|
|
|
|
|
13218
|
|
|
|
|
|
|
bool load(istream& is); |
|
13219
|
|
|
|
|
|
|
|
|
13220
|
|
|
|
|
|
|
private: |
|
13221
|
|
|
|
|
|
|
unsigned url_email_tokenizer; |
|
13222
|
|
|
|
|
|
|
unsigned segment; |
|
13223
|
|
|
|
|
|
|
bool allow_spaces; |
|
13224
|
|
|
|
|
|
|
|
|
13225
|
|
|
|
|
|
|
unique_ptr network; |
|
13226
|
|
|
|
|
|
|
}; |
|
13227
|
|
|
|
|
|
|
|
|
13228
|
|
|
|
|
|
|
} // namespace morphodita |
|
13229
|
|
|
|
|
|
|
|
|
13230
|
|
|
|
|
|
|
///////// |
|
13231
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_factory.cpp |
|
13232
|
|
|
|
|
|
|
///////// |
|
13233
|
|
|
|
|
|
|
|
|
13234
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
13235
|
|
|
|
|
|
|
// |
|
13236
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13237
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13238
|
|
|
|
|
|
|
// |
|
13239
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13240
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13241
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13242
|
|
|
|
|
|
|
|
|
13243
|
|
|
|
|
|
|
namespace morphodita { |
|
13244
|
|
|
|
|
|
|
|
|
13245
|
1
|
|
|
|
|
|
tokenizer* gru_tokenizer_factory::new_tokenizer(const morpho* /*m*/) const { |
|
13246
|
2
|
|
|
|
|
|
return new gru_tokenizer(url_email_tokenizer, segment, allow_spaces, *network); |
|
13247
|
|
|
|
|
|
|
} |
|
13248
|
|
|
|
|
|
|
|
|
13249
|
1
|
|
|
|
|
|
bool gru_tokenizer_factory::load(istream& is) { |
|
13250
|
|
|
|
|
|
|
char version; |
|
13251
|
1
|
50
|
|
|
|
|
if (!is.get(version)) return false; |
|
13252
|
1
|
50
|
|
|
|
|
if (!(version >= 1 && version <= 2)) return false; |
|
13253
|
|
|
|
|
|
|
|
|
13254
|
|
|
|
|
|
|
binary_decoder data; |
|
13255
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return false; |
|
|
|
50
|
|
|
|
|
|
|
13256
|
|
|
|
|
|
|
|
|
13257
|
|
|
|
|
|
|
try { |
|
13258
|
1
|
50
|
|
|
|
|
url_email_tokenizer = data.next_1B(); |
|
13259
|
1
|
50
|
|
|
|
|
segment = data.next_2B(); |
|
13260
|
1
|
50
|
|
|
|
|
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13261
|
|
|
|
|
|
|
|
|
13262
|
1
|
50
|
|
|
|
|
network.reset(gru_tokenizer_network::load(data)); |
|
13263
|
1
|
50
|
|
|
|
|
if (!network) return false; |
|
|
|
0
|
|
|
|
|
|
|
13264
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
|
13265
|
|
|
|
|
|
|
return false; |
|
13266
|
|
|
|
|
|
|
} |
|
13267
|
|
|
|
|
|
|
|
|
13268
|
1
|
|
|
|
|
|
return data.is_end(); |
|
13269
|
|
|
|
|
|
|
} |
|
13270
|
|
|
|
|
|
|
|
|
13271
|
|
|
|
|
|
|
} // namespace morphodita |
|
13272
|
|
|
|
|
|
|
|
|
13273
|
|
|
|
|
|
|
///////// |
|
13274
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_network.cpp |
|
13275
|
|
|
|
|
|
|
///////// |
|
13276
|
|
|
|
|
|
|
|
|
13277
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
13278
|
|
|
|
|
|
|
// |
|
13279
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13280
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13281
|
|
|
|
|
|
|
// |
|
13282
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13283
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13284
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13285
|
|
|
|
|
|
|
|
|
13286
|
|
|
|
|
|
|
namespace morphodita { |
|
13287
|
|
|
|
|
|
|
|
|
13288
|
1
|
|
|
|
|
|
gru_tokenizer_network* gru_tokenizer_network::load(binary_decoder& data) { |
|
13289
|
1
|
50
|
|
|
|
|
if (data.next_1B() != 1) return nullptr; |
|
13290
|
1
|
|
|
|
|
|
switch (data.next_1B()) { |
|
13291
|
1
|
|
|
|
|
|
case 16: return gru_tokenizer_network_implementation<16>::load(data); |
|
13292
|
0
|
|
|
|
|
|
case 24: return gru_tokenizer_network_implementation<24>::load(data); |
|
13293
|
0
|
|
|
|
|
|
case 64: return gru_tokenizer_network_implementation<64>::load(data); |
|
13294
|
|
|
|
|
|
|
} |
|
13295
|
|
|
|
|
|
|
return nullptr; |
|
13296
|
|
|
|
|
|
|
} |
|
13297
|
|
|
|
|
|
|
|
|
13298
|
|
|
|
|
|
|
} // namespace morphodita |
|
13299
|
|
|
|
|
|
|
|
|
13300
|
|
|
|
|
|
|
///////// |
|
13301
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_trainer.h |
|
13302
|
|
|
|
|
|
|
///////// |
|
13303
|
|
|
|
|
|
|
|
|
13304
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
13305
|
|
|
|
|
|
|
// |
|
13306
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13307
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13308
|
|
|
|
|
|
|
// |
|
13309
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13310
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13311
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13312
|
|
|
|
|
|
|
|
|
13313
|
|
|
|
|
|
|
namespace morphodita { |
|
13314
|
|
|
|
|
|
|
|
|
13315
|
0
|
|
|
|
|
|
struct tokenized_sentence { |
|
13316
|
|
|
|
|
|
|
u32string sentence; |
|
13317
|
|
|
|
|
|
|
vector tokens; |
|
13318
|
|
|
|
|
|
|
}; |
|
13319
|
|
|
|
|
|
|
|
|
13320
|
|
|
|
|
|
|
class gru_tokenizer_trainer { |
|
13321
|
|
|
|
|
|
|
public: |
|
13322
|
|
|
|
|
|
|
enum { URL_EMAIL_LATEST = unicode_tokenizer::URL_EMAIL_LATEST }; |
|
13323
|
|
|
|
|
|
|
|
|
13324
|
|
|
|
|
|
|
static bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs, |
|
13325
|
|
|
|
|
|
|
unsigned batch_size, float learning_rate, float learning_rate_final, float dropout, |
|
13326
|
|
|
|
|
|
|
float initialization_range, bool early_stopping, const vector& data, |
|
13327
|
|
|
|
|
|
|
const vector& heldout, ostream& os, string& error); |
|
13328
|
|
|
|
|
|
|
}; |
|
13329
|
|
|
|
|
|
|
|
|
13330
|
|
|
|
|
|
|
} // namespace morphodita |
|
13331
|
|
|
|
|
|
|
|
|
13332
|
|
|
|
|
|
|
///////// |
|
13333
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_network_trainer.h |
|
13334
|
|
|
|
|
|
|
///////// |
|
13335
|
|
|
|
|
|
|
|
|
13336
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
13337
|
|
|
|
|
|
|
// |
|
13338
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13339
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13340
|
|
|
|
|
|
|
// |
|
13341
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13342
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13343
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13344
|
|
|
|
|
|
|
|
|
13345
|
|
|
|
|
|
|
namespace morphodita { |
|
13346
|
|
|
|
|
|
|
|
|
13347
|
|
|
|
|
|
|
// |
|
13348
|
|
|
|
|
|
|
// Declarations |
|
13349
|
|
|
|
|
|
|
// |
|
13350
|
|
|
|
|
|
|
|
|
13351
|
|
|
|
|
|
|
template |
|
13352
|
0
|
0
|
|
|
|
|
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13353
|
|
|
|
|
|
|
public: |
|
13354
|
|
|
|
|
|
|
bool train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size, |
|
13355
|
|
|
|
|
|
|
float learning_rate, float learning_rate_final, float dropout, float initialization_range, |
|
13356
|
|
|
|
|
|
|
bool early_stopping, const vector& data, const vector& heldout, |
|
13357
|
|
|
|
|
|
|
binary_encoder& enc, string& error); |
|
13358
|
|
|
|
|
|
|
|
|
13359
|
|
|
|
|
|
|
private: |
|
13360
|
|
|
|
|
|
|
template using matrix = typename gru_tokenizer_network_implementation::template matrix; |
|
13361
|
|
|
|
|
|
|
using typename gru_tokenizer_network_implementation::cached_embedding; |
|
13362
|
|
|
|
|
|
|
using typename gru_tokenizer_network_implementation::gru; |
|
13363
|
|
|
|
|
|
|
|
|
13364
|
|
|
|
|
|
|
template struct matrix_trainer { |
|
13365
|
|
|
|
|
|
|
matrix& original; |
|
13366
|
|
|
|
|
|
|
float w_g[R][C], b_g[R]; |
|
13367
|
|
|
|
|
|
|
float w_m[R][C], b_m[R]; |
|
13368
|
|
|
|
|
|
|
float w_v[R][C], b_v[R]; |
|
13369
|
|
|
|
|
|
|
|
|
13370
|
0
|
0
|
|
|
|
|
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13371
|
|
|
|
|
|
|
void update_weights(float learning_rate); |
|
13372
|
|
|
|
|
|
|
}; |
|
13373
|
0
|
|
|
|
|
|
struct gru_trainer { |
|
13374
|
|
|
|
|
|
|
matrix_trainer X, X_r, X_z; |
|
13375
|
|
|
|
|
|
|
matrix_trainer H, H_r, H_z; |
|
13376
|
|
|
|
|
|
|
vector> states, updates, resets, resetstates, candidates, dropouts; |
|
13377
|
|
|
|
|
|
|
|
|
13378
|
0
|
|
|
|
|
|
gru_trainer(gru& g, unsigned segment) |
|
13379
|
|
|
|
|
|
|
: X(g.X), X_r(g.X_r), X_z(g.X_z), H(g.H), H_r(g.H_r), H_z(g.H_z), states(segment + 1), |
|
13380
|
0
|
0
|
|
|
|
|
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13381
|
|
|
|
|
|
|
void update_weights(float learning_rate); |
|
13382
|
|
|
|
|
|
|
}; |
|
13383
|
|
|
|
|
|
|
|
|
13384
|
|
|
|
|
|
|
struct f1_info { double precision, recall, f1; }; |
|
13385
|
|
|
|
|
|
|
void evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector& heldout, |
|
13386
|
|
|
|
|
|
|
f1_info& tokens_f1, f1_info& sentences_f1); |
|
13387
|
|
|
|
|
|
|
void evaluate_f1(const vector& system, const vector& gold, f1_info& f1); |
|
13388
|
|
|
|
|
|
|
|
|
13389
|
|
|
|
|
|
|
template void random_matrix(matrix& m, mt19937& generator, float range, float bias); |
|
13390
|
|
|
|
|
|
|
void random_gru(gru& g, mt19937& generator, float range); |
|
13391
|
|
|
|
|
|
|
|
|
13392
|
|
|
|
|
|
|
template void save_matrix(const matrix& m, binary_encoder& enc); |
|
13393
|
|
|
|
|
|
|
void save_gru(const gru& g, binary_encoder& enc); |
|
13394
|
|
|
|
|
|
|
}; |
|
13395
|
|
|
|
|
|
|
|
|
13396
|
|
|
|
|
|
|
// |
|
13397
|
|
|
|
|
|
|
// Definitions |
|
13398
|
|
|
|
|
|
|
// |
|
13399
|
|
|
|
|
|
|
|
|
13400
|
|
|
|
|
|
|
template |
|
13401
|
0
|
|
|
|
|
|
bool gru_tokenizer_network_trainer::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned epochs, unsigned batch_size, |
|
13402
|
|
|
|
|
|
|
float learning_rate_initial, float learning_rate_final, float dropout, |
|
13403
|
|
|
|
|
|
|
float initialization_range, bool early_stopping, const vector& data, |
|
13404
|
|
|
|
|
|
|
const vector& heldout, binary_encoder& enc, string& error) { |
|
13405
|
0
|
0
|
|
|
|
|
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13406
|
|
|
|
|
|
|
|
|
13407
|
|
|
|
|
|
|
unsigned characters = 0; |
|
13408
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13409
|
0
|
|
|
|
|
|
characters += sentence.sentence.size(); |
|
13410
|
0
|
0
|
|
|
|
|
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13411
|
|
|
|
|
|
|
|
|
13412
|
|
|
|
|
|
|
mt19937 generator; |
|
13413
|
|
|
|
|
|
|
|
|
13414
|
0
|
|
|
|
|
|
float dropout_multiplier = 1.f / (1.f - dropout); |
|
13415
|
0
|
|
|
|
|
|
bernoulli_distribution dropout_distribution(dropout); |
|
13416
|
|
|
|
|
|
|
|
|
13417
|
|
|
|
|
|
|
// Generate embeddings |
|
13418
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13419
|
0
|
0
|
|
|
|
|
for (auto&& chr : sentence.sentence) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13420
|
0
|
|
|
|
|
|
if (!this->embeddings.count(chr)) { |
|
13421
|
|
|
|
|
|
|
cached_embedding embedding; |
|
13422
|
0
|
|
|
|
|
|
random_matrix(embedding.e, generator, initialization_range, 0.f); |
|
13423
|
|
|
|
|
|
|
this->embeddings.emplace(chr, embedding); |
|
13424
|
|
|
|
|
|
|
} |
|
13425
|
|
|
|
|
|
|
this->empty_embedding.e.clear(); |
|
13426
|
|
|
|
|
|
|
|
|
13427
|
|
|
|
|
|
|
// Initialize weights |
|
13428
|
0
|
|
|
|
|
|
random_gru(this->gru_fwd, generator, initialization_range); |
|
13429
|
0
|
|
|
|
|
|
random_gru(this->gru_bwd, generator, initialization_range); |
|
13430
|
0
|
|
|
|
|
|
random_matrix(this->projection_fwd, generator, initialization_range, 0.f); this->projection_fwd.b[this->NO_SPLIT] = 1.f; |
|
13431
|
0
|
|
|
|
|
|
random_matrix(this->projection_bwd, generator, initialization_range, 0.f); this->projection_bwd.b[this->NO_SPLIT] = 1.f; |
|
13432
|
|
|
|
|
|
|
|
|
13433
|
|
|
|
|
|
|
// Train the network |
|
13434
|
|
|
|
|
|
|
unordered_map> embeddings; |
|
13435
|
0
|
0
|
|
|
|
|
for (auto&& embedding : this->embeddings) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13436
|
0
|
|
|
|
|
|
embeddings.emplace(embedding.first, embedding.second.e); |
|
13437
|
0
|
0
|
|
|
|
|
vector*> chosen_embeddings(segment); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13438
|
0
|
0
|
|
|
|
|
vector> embedding_dropouts(segment); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13439
|
0
|
0
|
|
|
|
|
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13440
|
0
|
|
|
|
|
|
matrix_trainer<3, D> projection_fwd(this->projection_fwd), projection_bwd(this->projection_bwd); |
|
13441
|
|
|
|
|
|
|
float learning_rate = learning_rate_initial, b1t = 1.f, b2t = 1.f; |
|
13442
|
|
|
|
|
|
|
|
|
13443
|
|
|
|
|
|
|
float best_combined_f1 = 0.f; unsigned best_combined_f1_epoch = 0; |
|
13444
|
|
|
|
|
|
|
gru_tokenizer_network_trainer best_combined_f1_network; |
|
13445
|
|
|
|
|
|
|
|
|
13446
|
|
|
|
|
|
|
size_t training_offset = 0, training_shift; |
|
13447
|
0
|
0
|
|
|
|
|
vector training_input, instance_input(segment); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13448
|
0
|
0
|
|
|
|
|
vector training_output, instance_output(segment); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13449
|
0
|
0
|
|
|
|
|
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13450
|
0
|
0
|
|
|
|
|
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13451
|
|
|
|
|
|
|
double logprob = 0; |
|
13452
|
|
|
|
|
|
|
int total = 0, correct = 0; |
|
13453
|
|
|
|
|
|
|
|
|
13454
|
0
|
0
|
|
|
|
|
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13455
|
|
|
|
|
|
|
// Prepare input instance |
|
13456
|
0
|
0
|
|
|
|
|
if (training_offset + segment >= training_input.size()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13457
|
0
|
|
|
|
|
|
shuffle(permutation.begin(), permutation.end(), generator); |
|
13458
|
|
|
|
|
|
|
training_input.clear(); training_output.clear(); |
|
13459
|
0
|
0
|
|
|
|
|
for (auto&& index : permutation) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13460
|
0
|
|
|
|
|
|
auto& sentence = data[index]; |
|
13461
|
0
|
0
|
|
|
|
|
if (sentence.tokens.empty()) continue; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13462
|
|
|
|
|
|
|
|
|
13463
|
|
|
|
|
|
|
training_offset = training_input.size(); |
|
13464
|
0
|
0
|
|
|
|
|
training_input.resize(training_offset + sentence.sentence.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13465
|
0
|
0
|
|
|
|
|
training_output.resize(training_offset + sentence.sentence.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13466
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13467
|
0
|
|
|
|
|
|
training_input[training_offset + i].chr = sentence.sentence[i]; |
|
13468
|
0
|
|
|
|
|
|
training_output[training_offset + i].outcome = gru_tokenizer_network::NO_SPLIT; |
|
13469
|
|
|
|
|
|
|
} |
|
13470
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < sentence.tokens.size(); i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13471
|
0
|
0
|
|
|
|
|
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13472
|
|
|
|
|
|
|
i+1 < sentence.tokens.size() ? gru_tokenizer_network::END_OF_TOKEN : gru_tokenizer_network::END_OF_SENTENCE; |
|
13473
|
|
|
|
|
|
|
} |
|
13474
|
|
|
|
|
|
|
training_offset = 0; |
|
13475
|
|
|
|
|
|
|
} |
|
13476
|
|
|
|
|
|
|
copy_n(training_input.begin() + training_offset, segment, instance_input.begin()); |
|
13477
|
|
|
|
|
|
|
copy_n(training_output.begin() + training_offset, segment, instance_output.begin()); |
|
13478
|
|
|
|
|
|
|
|
|
13479
|
|
|
|
|
|
|
// Shift training_offset |
|
13480
|
0
|
0
|
|
|
|
|
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13481
|
0
|
0
|
|
|
|
|
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13482
|
|
|
|
|
|
|
break; |
|
13483
|
0
|
|
|
|
|
|
training_offset += training_shift; |
|
13484
|
|
|
|
|
|
|
|
|
13485
|
|
|
|
|
|
|
// Forward pass |
|
13486
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < segment; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13487
|
0
|
|
|
|
|
|
chosen_embeddings[i] = &embeddings.at(instance_input[i].chr); |
|
13488
|
0
|
0
|
|
|
|
|
for (unsigned k = 0; k < D; k++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13489
|
0
|
0
|
|
|
|
|
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13490
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13491
|
0
|
|
|
|
|
|
instance_output[i].w[j] = projection_fwd.original.b[j]; |
|
13492
|
|
|
|
|
|
|
} |
|
13493
|
|
|
|
|
|
|
|
|
13494
|
0
|
0
|
|
|
|
|
for (int dir = 0; dir < 2; dir++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13495
|
0
|
0
|
|
|
|
|
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13496
|
0
|
0
|
|
|
|
|
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13497
|
|
|
|
|
|
|
|
|
13498
|
|
|
|
|
|
|
gru.states[0].clear(); |
|
13499
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < segment; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13500
|
0
|
0
|
|
|
|
|
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13501
|
0
|
0
|
|
|
|
|
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13502
|
0
|
0
|
|
|
|
|
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13503
|
|
|
|
|
|
|
|
|
13504
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13505
|
0
|
|
|
|
|
|
gru.updates[i].w[0][j] = gru.X_z.original.b[j]; |
|
13506
|
0
|
|
|
|
|
|
gru.resets[i].w[0][j] = gru.X_r.original.b[j]; |
|
13507
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13508
|
0
|
|
|
|
|
|
gru.updates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_z.original.w[j][k] + gru.states[i].w[0][k] * gru.H_z.original.w[j][k]; |
|
13509
|
0
|
|
|
|
|
|
gru.resets[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X_r.original.w[j][k] + gru.states[i].w[0][k] * gru.H_r.original.w[j][k]; |
|
13510
|
|
|
|
|
|
|
} |
|
13511
|
0
|
|
|
|
|
|
gru.updates[i].w[0][j] = 1.f / (1.f + exp(-gru.updates[i].w[0][j])); |
|
13512
|
0
|
|
|
|
|
|
gru.resets[i].w[0][j] = 1.f / (1.f + exp(-gru.resets[i].w[0][j])); |
|
13513
|
0
|
|
|
|
|
|
gru.resetstates[i].w[0][j] = gru.resets[i].w[0][j] * gru.states[i].w[0][j]; |
|
13514
|
|
|
|
|
|
|
} |
|
13515
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13516
|
0
|
|
|
|
|
|
gru.candidates[i].w[0][j] = gru.X.original.b[j]; |
|
13517
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13518
|
0
|
|
|
|
|
|
gru.candidates[i].w[0][j] += embedding_dropout.w[0][k] * embedding->original.w[0][k] * gru.X.original.w[j][k] + gru.resetstates[i].w[0][k] * gru.H.original.w[j][k]; |
|
13519
|
0
|
|
|
|
|
|
gru.candidates[i].w[0][j] = tanh(gru.candidates[i].w[0][j]); |
|
13520
|
0
|
|
|
|
|
|
gru.states[i+1].w[0][j] = gru.updates[i].w[0][j] * gru.states[i].w[0][j] + (1.f - gru.updates[i].w[0][j]) * gru.candidates[i].w[0][j]; |
|
13521
|
|
|
|
|
|
|
} |
|
13522
|
|
|
|
|
|
|
|
|
13523
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13524
|
0
|
0
|
|
|
|
|
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13525
|
|
|
|
|
|
|
|
|
13526
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13527
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13528
|
0
|
|
|
|
|
|
output.w[j] += projection.original.w[j][k] * gru.dropouts[i].w[0][k]; |
|
13529
|
|
|
|
|
|
|
} |
|
13530
|
|
|
|
|
|
|
} |
|
13531
|
|
|
|
|
|
|
|
|
13532
|
0
|
0
|
|
|
|
|
for (auto&& output : instance_output) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13533
|
0
|
|
|
|
|
|
int best = output.w[1] > output.w[0]; |
|
13534
|
0
|
0
|
|
|
|
|
if (output.w[2] > output.w[best]) best = 2; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13535
|
0
|
|
|
|
|
|
float maximum = output.w[best], sum = 0; |
|
13536
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13537
|
0
|
|
|
|
|
|
sum = 1.f / sum; |
|
13538
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13539
|
|
|
|
|
|
|
|
|
13540
|
0
|
|
|
|
|
|
total++; |
|
13541
|
0
|
|
|
|
|
|
correct += best == output.outcome; |
|
13542
|
0
|
|
|
|
|
|
logprob += log(output.w[output.outcome]); |
|
13543
|
|
|
|
|
|
|
} |
|
13544
|
|
|
|
|
|
|
|
|
13545
|
|
|
|
|
|
|
// Backward pass |
|
13546
|
0
|
0
|
|
|
|
|
for (auto&& output : instance_output) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13547
|
0
|
0
|
|
|
|
|
for (int j = 0; j < 3; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13548
|
0
|
0
|
|
|
|
|
output.w[j] = (output.outcome == j) - output.w[j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13549
|
|
|
|
|
|
|
|
|
13550
|
0
|
0
|
|
|
|
|
for (int dir = 0; dir < 2; dir++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13551
|
0
|
0
|
|
|
|
|
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13552
|
0
|
0
|
|
|
|
|
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13553
|
|
|
|
|
|
|
|
|
13554
|
|
|
|
|
|
|
matrix<1, D> state_g, update_g, candidate_g, reset_g, resetstate_g; |
|
13555
|
|
|
|
|
|
|
state_g.clear(); |
|
13556
|
0
|
0
|
|
|
|
|
for (size_t i = segment; i--; ) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13557
|
0
|
0
|
|
|
|
|
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13558
|
0
|
0
|
|
|
|
|
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13559
|
0
|
0
|
|
|
|
|
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13560
|
|
|
|
|
|
|
|
|
13561
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) // These for cycles are swapped because |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13562
|
0
|
0
|
|
|
|
|
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13563
|
0
|
|
|
|
|
|
projection.w_g[k][j] += gru.dropouts[i].w[0][j] * output.w[k]; |
|
13564
|
|
|
|
|
|
|
|
|
13565
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13566
|
0
|
0
|
|
|
|
|
if (gru.dropouts[i].w[0][j]) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13567
|
0
|
0
|
|
|
|
|
for (int k = 0; k < 3; k++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13568
|
0
|
|
|
|
|
|
state_g.w[0][j] += projection.original.w[k][j] * output.w[k]; |
|
13569
|
|
|
|
|
|
|
|
|
13570
|
|
|
|
|
|
|
resetstate_g.clear(); |
|
13571
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13572
|
0
|
|
|
|
|
|
update_g.w[0][j] = state_g.w[0][j] * (gru.states[i].w[0][j] - gru.candidates[i].w[0][j]); |
|
13573
|
0
|
|
|
|
|
|
candidate_g.w[0][j] = state_g.w[0][j] * (1.f - gru.updates[i].w[0][j]); |
|
13574
|
0
|
|
|
|
|
|
state_g.w[0][j] = state_g.w[0][j] * gru.updates[i].w[0][j]; |
|
13575
|
|
|
|
|
|
|
|
|
13576
|
0
|
|
|
|
|
|
candidate_g.w[0][j] *= 1 - gru.candidates[i].w[0][j] * gru.candidates[i].w[0][j]; |
|
13577
|
0
|
|
|
|
|
|
gru.X.b_g[j] += candidate_g.w[0][j]; |
|
13578
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13579
|
0
|
|
|
|
|
|
gru.X.w_g[j][k] += candidate_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k]; |
|
13580
|
0
|
|
|
|
|
|
gru.H.w_g[j][k] += candidate_g.w[0][j] * gru.resetstates[i].w[0][k]; |
|
13581
|
0
|
|
|
|
|
|
embedding->w_g[0][k] += embedding_dropout.w[0][k] * candidate_g.w[0][j] * gru.X.original.w[j][k]; |
|
13582
|
0
|
|
|
|
|
|
resetstate_g.w[0][k] += candidate_g.w[0][j] * gru.H.original.w[j][k]; |
|
13583
|
|
|
|
|
|
|
} |
|
13584
|
|
|
|
|
|
|
} |
|
13585
|
0
|
0
|
|
|
|
|
for (int j = 0; j < D; j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13586
|
0
|
|
|
|
|
|
state_g.w[0][j] += resetstate_g.w[0][j] * gru.resets[i].w[0][j]; |
|
13587
|
0
|
|
|
|
|
|
reset_g.w[0][j] = resetstate_g.w[0][j] * gru.states[i].w[0][j]; |
|
13588
|
|
|
|
|
|
|
|
|
13589
|
0
|
|
|
|
|
|
update_g.w[0][j] *= gru.updates[i].w[0][j] * (1 - gru.updates[i].w[0][j]); |
|
13590
|
0
|
|
|
|
|
|
reset_g.w[0][j] *= gru.resets[i].w[0][j] * (1 - gru.resets[i].w[0][j]); |
|
13591
|
|
|
|
|
|
|
|
|
13592
|
0
|
|
|
|
|
|
gru.X_z.b_g[j] += update_g.w[0][j]; |
|
13593
|
0
|
|
|
|
|
|
gru.X_r.b_g[j] += reset_g.w[0][j]; |
|
13594
|
0
|
0
|
|
|
|
|
for (int k = 0; k < D; k++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13595
|
0
|
|
|
|
|
|
gru.X_z.w_g[j][k] += update_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k]; |
|
13596
|
0
|
|
|
|
|
|
gru.H_z.w_g[j][k] += update_g.w[0][j] * gru.states[i].w[0][k]; |
|
13597
|
0
|
|
|
|
|
|
gru.X_r.w_g[j][k] += reset_g.w[0][j] * embedding_dropout.w[0][k] * embedding->original.w[0][k]; |
|
13598
|
0
|
|
|
|
|
|
gru.H_r.w_g[j][k] += reset_g.w[0][j] * gru.states[i].w[0][k]; |
|
13599
|
0
|
|
|
|
|
|
embedding->w_g[0][k] += embedding_dropout.w[0][k] * (update_g.w[0][j] * gru.X_z.original.w[j][k] + |
|
13600
|
0
|
|
|
|
|
|
reset_g.w[0][j] * gru.X_r.original.w[j][k]); |
|
13601
|
0
|
|
|
|
|
|
state_g.w[0][k] += update_g.w[0][j] * gru.H_z.original.w[j][k] + reset_g.w[0][j] * gru.H_r.original.w[j][k]; |
|
13602
|
|
|
|
|
|
|
} |
|
13603
|
|
|
|
|
|
|
} |
|
13604
|
|
|
|
|
|
|
} |
|
13605
|
|
|
|
|
|
|
} |
|
13606
|
|
|
|
|
|
|
|
|
13607
|
|
|
|
|
|
|
// Update the weights |
|
13608
|
0
|
0
|
|
|
|
|
if (batch_size == 1 || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13609
|
|
|
|
|
|
|
instance+1 == instances || |
|
13610
|
|
|
|
|
|
|
(instance+1) % batch_size == 0) { |
|
13611
|
0
|
|
|
|
|
|
b1t *= 0.9f; |
|
13612
|
0
|
|
|
|
|
|
b2t *= 0.999f; |
|
13613
|
0
|
|
|
|
|
|
float learning_rate_biased = learning_rate * sqrt(1-b2t) / (1-b1t); |
|
13614
|
|
|
|
|
|
|
|
|
13615
|
0
|
0
|
|
|
|
|
if (batch_size == 1) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13616
|
0
|
0
|
|
|
|
|
for (auto&& chosen_embedding : chosen_embeddings) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13617
|
0
|
|
|
|
|
|
chosen_embedding->update_weights(learning_rate_biased); |
|
13618
|
|
|
|
|
|
|
else |
|
13619
|
0
|
0
|
|
|
|
|
for (auto&& embedding : embeddings) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13620
|
0
|
|
|
|
|
|
embedding.second.update_weights(learning_rate_biased); |
|
13621
|
0
|
|
|
|
|
|
gru_fwd.update_weights(learning_rate_biased); |
|
13622
|
0
|
|
|
|
|
|
gru_bwd.update_weights(learning_rate_biased); |
|
13623
|
0
|
|
|
|
|
|
projection_fwd.update_weights(learning_rate_biased); |
|
13624
|
0
|
|
|
|
|
|
projection_bwd.update_weights(learning_rate_biased); |
|
13625
|
|
|
|
|
|
|
} |
|
13626
|
|
|
|
|
|
|
} |
|
13627
|
0
|
0
|
|
|
|
|
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13628
|
0
|
|
|
|
|
|
learning_rate = exp(((epochs - epoch - 2) * log(learning_rate_initial) + (epoch + 1) * log(learning_rate_final)) / (epochs - 1)); |
|
13629
|
|
|
|
|
|
|
|
|
13630
|
|
|
|
|
|
|
// Evaluate |
|
13631
|
0
|
0
|
|
|
|
|
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13632
|
0
|
|
|
|
|
|
<< ", training acc: " << fixed << setprecision(2) << 100. * correct / double(total) << "%"; |
|
13633
|
0
|
0
|
|
|
|
|
if (!heldout.empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13634
|
|
|
|
|
|
|
f1_info tokens, sentences; |
|
13635
|
0
|
0
|
|
|
|
|
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13636
|
0
|
0
|
|
|
|
|
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13637
|
0
|
|
|
|
|
|
<< 100. * tokens.f1 << "%, sentences: " << 100. * sentences.precision << "%P/" |
|
13638
|
0
|
|
|
|
|
|
<< 100. * sentences.recall << "%R/" << 100. * sentences.f1 << "%"; |
|
13639
|
|
|
|
|
|
|
|
|
13640
|
0
|
0
|
|
|
|
|
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13641
|
0
|
|
|
|
|
|
best_combined_f1 = sentences.f1 + tokens.f1; |
|
13642
|
|
|
|
|
|
|
best_combined_f1_epoch = epoch; |
|
13643
|
|
|
|
|
|
|
best_combined_f1_network = *this; |
|
13644
|
|
|
|
|
|
|
} |
|
13645
|
0
|
0
|
|
|
|
|
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13646
|
|
|
|
|
|
|
cerr << endl << "Stopping after 30 iterations of not improving sum of sentence and token f1." << endl; |
|
13647
|
0
|
|
|
|
|
|
break; |
|
13648
|
|
|
|
|
|
|
} |
|
13649
|
|
|
|
|
|
|
} |
|
13650
|
|
|
|
|
|
|
cerr << endl; |
|
13651
|
|
|
|
|
|
|
} |
|
13652
|
|
|
|
|
|
|
|
|
13653
|
|
|
|
|
|
|
// Choose best network if desired |
|
13654
|
0
|
0
|
|
|
|
|
if (early_stopping && best_combined_f1) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13655
|
0
|
|
|
|
|
|
cerr << "Choosing parameters from epoch " << best_combined_f1_epoch+1 << "." << endl; |
|
13656
|
|
|
|
|
|
|
this->embeddings = best_combined_f1_network.embeddings; |
|
13657
|
0
|
|
|
|
|
|
this->gru_fwd = best_combined_f1_network.gru_fwd; |
|
13658
|
0
|
|
|
|
|
|
this->gru_bwd = best_combined_f1_network.gru_bwd; |
|
13659
|
0
|
|
|
|
|
|
this->projection_fwd = best_combined_f1_network.projection_fwd; |
|
13660
|
0
|
|
|
|
|
|
this->projection_bwd = best_combined_f1_network.projection_bwd; |
|
13661
|
|
|
|
|
|
|
} |
|
13662
|
|
|
|
|
|
|
|
|
13663
|
|
|
|
|
|
|
// Encode the network |
|
13664
|
0
|
0
|
|
|
|
|
enc.add_1B(1); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13665
|
0
|
0
|
|
|
|
|
enc.add_1B(D); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13666
|
|
|
|
|
|
|
|
|
13667
|
0
|
|
|
|
|
|
enc.add_4B(this->embeddings.size()); |
|
13668
|
0
|
0
|
|
|
|
|
for (auto&& embedding : this->embeddings) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13669
|
0
|
|
|
|
|
|
enc.add_4B(embedding.first); |
|
13670
|
0
|
|
|
|
|
|
enc.add_data(embedding.second.e.w[0], D); |
|
13671
|
|
|
|
|
|
|
} |
|
13672
|
0
|
0
|
|
|
|
|
save_gru(this->gru_fwd, enc); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13673
|
0
|
0
|
|
|
|
|
save_gru(this->gru_bwd, enc); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13674
|
0
|
0
|
|
|
|
|
save_matrix(this->projection_fwd, enc); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13675
|
0
|
0
|
|
|
|
|
save_matrix(this->projection_bwd, enc); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13676
|
|
|
|
|
|
|
|
|
13677
|
|
|
|
|
|
|
return true; |
|
13678
|
|
|
|
|
|
|
} |
|
13679
|
|
|
|
|
|
|
|
|
13680
|
|
|
|
|
|
|
template template |
|
13681
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::matrix_trainer::update_weights(float learning_rate) { |
|
13682
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13683
|
0
|
0
|
|
|
|
|
for (int j = 0; j < C; j++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13684
|
0
|
|
|
|
|
|
w_m[i][j] = 0.9 * w_m[i][j] + (1-0.9) * w_g[i][j]; |
|
13685
|
0
|
|
|
|
|
|
w_v[i][j] = 0.999 * w_v[i][j] + (1-0.999) * w_g[i][j] * w_g[i][j]; |
|
13686
|
0
|
|
|
|
|
|
original.w[i][j] += learning_rate * w_m[i][j] / (sqrt(w_v[i][j]) + 1e-8); |
|
13687
|
|
|
|
|
|
|
} |
|
13688
|
0
|
|
|
|
|
|
b_m[i] = 0.9 * b_m[i] + (1-0.9) * b_g[i]; |
|
13689
|
0
|
|
|
|
|
|
b_v[i] = 0.999 * b_v[i] + (1-0.999) * b_g[i] * b_g[i]; |
|
13690
|
0
|
|
|
|
|
|
original.b[i] += learning_rate * b_m[i] / (sqrt(b_v[i]) + 1e-8); |
|
13691
|
|
|
|
|
|
|
} |
|
13692
|
|
|
|
|
|
|
|
|
13693
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13694
|
0
|
0
|
|
|
|
|
for (int j = 0; j < C; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13695
|
0
|
|
|
|
|
|
w_g[i][j] = 0.f; |
|
13696
|
0
|
|
|
|
|
|
b_g[i] = 0.f; |
|
13697
|
|
|
|
|
|
|
} |
|
13698
|
0
|
|
|
|
|
|
} |
|
13699
|
|
|
|
|
|
|
|
|
13700
|
|
|
|
|
|
|
template |
|
13701
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::gru_trainer::update_weights(float learning_rate) { |
|
13702
|
0
|
|
|
|
|
|
X.update_weights(learning_rate); |
|
13703
|
0
|
|
|
|
|
|
X_r.update_weights(learning_rate); |
|
13704
|
0
|
|
|
|
|
|
X_z.update_weights(learning_rate); |
|
13705
|
0
|
|
|
|
|
|
H.update_weights(learning_rate); |
|
13706
|
0
|
|
|
|
|
|
H_r.update_weights(learning_rate); |
|
13707
|
0
|
|
|
|
|
|
H_z.update_weights(learning_rate); |
|
13708
|
0
|
|
|
|
|
|
} |
|
13709
|
|
|
|
|
|
|
|
|
13710
|
|
|
|
|
|
|
template |
|
13711
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::evaluate(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, const vector& heldout, |
|
13712
|
|
|
|
|
|
|
f1_info& tokens_f1, f1_info& sentences_f1) { |
|
13713
|
|
|
|
|
|
|
// Generate gold data |
|
13714
|
|
|
|
|
|
|
vector gold_sentences, gold_tokens; |
|
13715
|
|
|
|
|
|
|
u32string text; |
|
13716
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13717
|
0
|
0
|
|
|
|
|
if (sentence.tokens.empty()) continue; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13718
|
|
|
|
|
|
|
|
|
13719
|
0
|
0
|
|
|
|
|
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13720
|
0
|
0
|
|
|
|
|
for (auto&& token : sentence.tokens) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13721
|
0
|
0
|
|
|
|
|
gold_tokens.emplace_back(text.size() + token.start, token.length); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13722
|
|
|
|
|
|
|
text.append(sentence.sentence); |
|
13723
|
|
|
|
|
|
|
} |
|
13724
|
|
|
|
|
|
|
|
|
13725
|
|
|
|
|
|
|
// Generate system data |
|
13726
|
|
|
|
|
|
|
vector system_sentences, system_tokens, tokens; |
|
13727
|
|
|
|
|
|
|
string text_utf8; |
|
13728
|
|
|
|
|
|
|
|
|
13729
|
0
|
|
|
|
|
|
this->cache_embeddings(); |
|
13730
|
0
|
|
|
|
|
|
gru_tokenizer tokenizer(url_email_tokenizer, segment, allow_spaces, *this); |
|
13731
|
0
|
0
|
|
|
|
|
unilib::utf8::encode(text, text_utf8); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13732
|
0
|
0
|
|
|
|
|
tokenizer.set_text(text_utf8); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13733
|
|
|
|
|
|
|
|
|
13734
|
0
|
0
|
|
|
|
|
while (tokenizer.next_sentence(tokens)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13735
|
0
|
0
|
|
|
|
|
if (!tokens.empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13736
|
0
|
0
|
|
|
|
|
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13737
|
0
|
0
|
|
|
|
|
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13738
|
|
|
|
|
|
|
} |
|
13739
|
|
|
|
|
|
|
|
|
13740
|
0
|
|
|
|
|
|
evaluate_f1(system_tokens, gold_tokens, tokens_f1); |
|
13741
|
0
|
|
|
|
|
|
evaluate_f1(system_sentences, gold_sentences, sentences_f1); |
|
13742
|
0
|
|
|
|
|
|
} |
|
13743
|
|
|
|
|
|
|
|
|
13744
|
|
|
|
|
|
|
template |
|
13745
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::evaluate_f1(const vector& system, const vector& gold, f1_info& f1) { |
|
13746
|
|
|
|
|
|
|
size_t both = 0; |
|
13747
|
0
|
0
|
|
|
|
|
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13748
|
0
|
0
|
|
|
|
|
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13749
|
0
|
|
|
|
|
|
si++; |
|
13750
|
0
|
0
|
|
|
|
|
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13751
|
0
|
|
|
|
|
|
gi++; |
|
13752
|
|
|
|
|
|
|
else |
|
13753
|
0
|
|
|
|
|
|
both += system[si++].length == gold[gi++].length; |
|
13754
|
|
|
|
|
|
|
|
|
13755
|
0
|
0
|
|
|
|
|
f1.precision = system.size() ? both / double(system.size()) : 0.; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13756
|
0
|
0
|
|
|
|
|
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13757
|
0
|
0
|
|
|
|
|
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13758
|
0
|
|
|
|
|
|
} |
|
13759
|
|
|
|
|
|
|
|
|
13760
|
|
|
|
|
|
|
template template |
|
13761
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::random_matrix(matrix& m, mt19937& generator, float range, float bias) { |
|
13762
|
0
|
|
|
|
|
|
uniform_real_distribution uniform(-range, range); |
|
13763
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13764
|
0
|
|
|
|
|
|
m.b[i] = bias; |
|
13765
|
0
|
0
|
|
|
|
|
for (int j = 0; j < C; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13766
|
0
|
|
|
|
|
|
m.w[i][j] = uniform(generator); |
|
13767
|
|
|
|
|
|
|
} |
|
13768
|
0
|
|
|
|
|
|
} |
|
13769
|
|
|
|
|
|
|
|
|
13770
|
|
|
|
|
|
|
template |
|
13771
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::random_gru(gru& g, mt19937& generator, float range) { |
|
13772
|
0
|
|
|
|
|
|
random_matrix(g.X, generator, range, 0.f); |
|
13773
|
0
|
|
|
|
|
|
random_matrix(g.X_r, generator, range, 1.f); |
|
13774
|
0
|
|
|
|
|
|
random_matrix(g.X_z, generator, range, 1.f); |
|
13775
|
0
|
|
|
|
|
|
random_matrix(g.H, generator, range, 0.f); |
|
13776
|
0
|
|
|
|
|
|
random_matrix(g.H_r, generator, range, 1.f); |
|
13777
|
0
|
|
|
|
|
|
random_matrix(g.H_z, generator, range, 1.f); |
|
13778
|
0
|
|
|
|
|
|
} |
|
13779
|
|
|
|
|
|
|
|
|
13780
|
|
|
|
|
|
|
template template |
|
13781
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::save_matrix(const matrix& m, binary_encoder& enc) { |
|
13782
|
0
|
0
|
|
|
|
|
for (int i = 0; i < R; i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13783
|
0
|
|
|
|
|
|
enc.add_data(m.w[i], C); |
|
13784
|
0
|
|
|
|
|
|
enc.add_data(m.b, R); |
|
13785
|
0
|
|
|
|
|
|
} |
|
13786
|
|
|
|
|
|
|
|
|
13787
|
|
|
|
|
|
|
template |
|
13788
|
0
|
|
|
|
|
|
void gru_tokenizer_network_trainer::save_gru(const gru& g, binary_encoder& enc) { |
|
13789
|
0
|
|
|
|
|
|
save_matrix(g.X, enc); |
|
13790
|
0
|
|
|
|
|
|
save_matrix(g.X_r, enc); |
|
13791
|
0
|
|
|
|
|
|
save_matrix(g.X_z, enc); |
|
13792
|
0
|
|
|
|
|
|
save_matrix(g.H, enc); |
|
13793
|
0
|
|
|
|
|
|
save_matrix(g.H_r, enc); |
|
13794
|
0
|
|
|
|
|
|
save_matrix(g.H_z, enc); |
|
13795
|
0
|
|
|
|
|
|
} |
|
13796
|
|
|
|
|
|
|
|
|
13797
|
|
|
|
|
|
|
} // namespace morphodita |
|
13798
|
|
|
|
|
|
|
|
|
13799
|
|
|
|
|
|
|
///////// |
|
13800
|
|
|
|
|
|
|
// File: morphodita/tokenizer/gru_tokenizer_trainer.cpp |
|
13801
|
|
|
|
|
|
|
///////// |
|
13802
|
|
|
|
|
|
|
|
|
13803
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
|
13804
|
|
|
|
|
|
|
// |
|
13805
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
13806
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13807
|
|
|
|
|
|
|
// |
|
13808
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13809
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13810
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13811
|
|
|
|
|
|
|
|
|
13812
|
|
|
|
|
|
|
namespace morphodita { |
|
13813
|
|
|
|
|
|
|
|
|
13814
|
0
|
|
|
|
|
|
bool gru_tokenizer_trainer::train(unsigned url_email_tokenizer, unsigned segment, bool allow_spaces, unsigned dimension, unsigned epochs, |
|
13815
|
|
|
|
|
|
|
unsigned batch_size, float learning_rate, float learning_rate_final, float dropout, |
|
13816
|
|
|
|
|
|
|
float initialization_range, bool early_stopping, const vector& data, |
|
13817
|
|
|
|
|
|
|
const vector& heldout, ostream& os, string& error) { |
|
13818
|
|
|
|
|
|
|
using namespace unilib; |
|
13819
|
|
|
|
|
|
|
|
|
13820
|
|
|
|
|
|
|
error.clear(); |
|
13821
|
|
|
|
|
|
|
|
|
13822
|
|
|
|
|
|
|
// Start encoding the tokenizer |
|
13823
|
0
|
|
|
|
|
|
os.put(2); |
|
13824
|
|
|
|
|
|
|
|
|
13825
|
0
|
|
|
|
|
|
binary_encoder enc; |
|
13826
|
0
|
0
|
|
|
|
|
enc.add_1B(url_email_tokenizer); |
|
13827
|
0
|
0
|
|
|
|
|
enc.add_2B(segment); |
|
13828
|
0
|
0
|
|
|
|
|
enc.add_1B(allow_spaces); |
|
13829
|
|
|
|
|
|
|
|
|
13830
|
|
|
|
|
|
|
// Train the GRU network |
|
13831
|
0
|
0
|
|
|
|
|
if (dimension == 16) { |
|
13832
|
|
|
|
|
|
|
gru_tokenizer_network_trainer<16> network; |
|
13833
|
0
|
0
|
|
|
|
|
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
|
0
|
|
|
|
|
|
|
13834
|
|
|
|
|
|
|
dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false; |
|
13835
|
0
|
0
|
|
|
|
|
} else if (dimension == 24) { |
|
13836
|
|
|
|
|
|
|
gru_tokenizer_network_trainer<24> network; |
|
13837
|
0
|
0
|
|
|
|
|
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
|
0
|
|
|
|
|
|
|
13838
|
|
|
|
|
|
|
dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false; |
|
13839
|
0
|
0
|
|
|
|
|
} else if (dimension == 64) { |
|
13840
|
|
|
|
|
|
|
gru_tokenizer_network_trainer<64> network; |
|
13841
|
0
|
0
|
|
|
|
|
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
|
0
|
|
|
|
|
|
|
13842
|
|
|
|
|
|
|
dropout, initialization_range, early_stopping, data, heldout, enc, error)) return false; |
|
13843
|
|
|
|
|
|
|
} else { |
|
13844
|
0
|
0
|
|
|
|
|
return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false; |
|
|
|
0
|
|
|
|
|
|
|
13845
|
|
|
|
|
|
|
} |
|
13846
|
|
|
|
|
|
|
|
|
13847
|
|
|
|
|
|
|
// Compute best substitutions for every category |
|
13848
|
|
|
|
|
|
|
unordered_map> counts; |
|
13849
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
13850
|
0
|
0
|
|
|
|
|
for (auto&& chr : sentence.sentence) |
|
13851
|
0
|
|
|
|
|
|
counts[unicode::category(chr)][chr]++; |
|
13852
|
|
|
|
|
|
|
|
|
13853
|
|
|
|
|
|
|
unordered_map unknown_chars; |
|
13854
|
0
|
0
|
|
|
|
|
for (auto&& count : counts) { |
|
13855
|
0
|
|
|
|
|
|
char32_t best_chr = 0; |
|
13856
|
|
|
|
|
|
|
unsigned best = 0; |
|
13857
|
0
|
0
|
|
|
|
|
for (auto&& chr : count.second) |
|
13858
|
0
|
0
|
|
|
|
|
if (chr.second > best) |
|
13859
|
0
|
|
|
|
|
|
best = chr.second, best_chr = chr.first; |
|
13860
|
0
|
0
|
|
|
|
|
if (best_chr) |
|
13861
|
0
|
|
|
|
|
|
unknown_chars.emplace(count.first, best_chr); |
|
13862
|
|
|
|
|
|
|
} |
|
13863
|
0
|
0
|
|
|
|
|
enc.add_1B(unknown_chars.size()); |
|
13864
|
0
|
0
|
|
|
|
|
for (auto&& unknown_char : unknown_chars) { |
|
13865
|
0
|
|
|
|
|
|
enc.add_4B(unknown_char.first); |
|
13866
|
0
|
|
|
|
|
|
enc.add_4B(unknown_char.second); |
|
13867
|
|
|
|
|
|
|
} |
|
13868
|
|
|
|
|
|
|
|
|
13869
|
0
|
0
|
|
|
|
|
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
13870
|
|
|
|
|
|
|
return true; |
|
13871
|
|
|
|
|
|
|
} |
|
13872
|
|
|
|
|
|
|
|
|
13873
|
|
|
|
|
|
|
} // namespace morphodita |
|
13874
|
|
|
|
|
|
|
|
|
13875
|
|
|
|
|
|
|
///////// |
|
13876
|
|
|
|
|
|
|
// File: morphodita/tokenizer/ragel_tokenizer.cpp |
|
13877
|
|
|
|
|
|
|
///////// |
|
13878
|
|
|
|
|
|
|
|
|
13879
|
|
|
|
|
|
|
// This file is part of MorphoDiTa <http://github.com/ufal/morphodita/>. |
|
13880
|
|
|
|
|
|
|
// |
|
13881
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
13882
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
13883
|
|
|
|
|
|
|
// |
|
13884
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
13885
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
13886
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
13887
|
|
|
|
|
|
|
|
|
13888
|
|
|
|
|
|
|
namespace morphodita { |
|
13889
|
|
|
|
|
|
|
|
|
13890
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (75 entries). The `_ragel_`
// prefix suggests it is emitted by the Ragel state machine compiler, so the
// values should presumably be regenerated rather than edited by hand.
// NOTE(review): in Ragel's table layout this would be, per state, the index of
// the state's first entry in the condition tables; the code that indexes it is
// outside this part of the file -- confirm before relying on it.
static const char _ragel_url_email_cond_offsets[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 2, 3, 3, 4, 5,
	6, 7, 8, 9, 10, 11, 12, 13,
	14, 15, 16
};
|
13902
|
|
|
|
|
|
|
|
|
13903
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (75 entries, each 0 or 1).
// The `_ragel_` prefix suggests it is emitted by the Ragel state machine
// compiler, so it should presumably be regenerated rather than edited by hand.
// NOTE(review): in Ragel's table layout this would be the number of condition
// key ranges of each state; the code that reads it is outside this part of the
// file -- confirm before relying on it.
static const char _ragel_url_email_cond_lengths[] = {
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 1, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 1, 1, 0, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1
};
|
13915
|
|
|
|
|
|
|
|
|
13916
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (35 entries: 17 pairs and a
// trailing 0). The `_ragel_` prefix suggests it is emitted by the Ragel state
// machine compiler, so it should presumably be regenerated, not hand-edited.
// NOTE(review): in Ragel's table layout each pair would be an inclusive
// [low, high] character range guarded by a condition (here ')' = 41 and
// '/' = 47); the code that reads it is outside this part of the file --
// confirm before relying on it.
static const short _ragel_url_email_cond_keys[] = {
	41u, 41u, 47u, 47u, 47u, 47u, 41u, 41u,
	47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
	47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
	47u, 47u, 47u, 47u, 47u, 47u, 47u, 47u,
	47u, 47u, 0
};
|
13923
|
|
|
|
|
|
|
|
|
13924
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (18 entries: 17 values and
// a trailing 0). The `_ragel_` prefix suggests it is emitted by the Ragel
// state machine compiler, so it should presumably be regenerated, not edited.
// NOTE(review): in Ragel's table layout each value would select which
// condition applies to the corresponding range in the condition key table;
// the code that reads it is outside this part of the file -- confirm.
static const char _ragel_url_email_cond_spaces[] = {
	1, 0, 0, 1, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0,
	0, 0
};
|
13929
|
|
|
|
|
|
|
|
|
13930
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (75 non-decreasing
// entries). The `_ragel_` prefix suggests it is emitted by the Ragel state
// machine compiler, so it should presumably be regenerated, not hand-edited.
// NOTE(review): in Ragel's table layout this would be, per state, the index of
// the state's first key in `_ragel_url_email_trans_keys`; the code that reads
// it is outside this part of the file -- confirm before relying on it.
static const short _ragel_url_email_key_offsets[] = {
	0, 0, 15, 29, 41, 54, 63, 71,
	78, 86, 92, 100, 117, 145, 154, 162,
	171, 179, 188, 196, 204, 215, 225, 233,
	241, 252, 262, 270, 278, 289, 299, 315,
	330, 346, 360, 376, 393, 409, 426, 442,
	459, 475, 491, 510, 528, 544, 560, 579,
	597, 613, 629, 648, 666, 682, 698, 714,
	725, 726, 741, 752, 756, 773, 801, 812,
	823, 834, 848, 861, 879, 893, 908, 926,
	944, 962, 983
};
|
13942
|
|
|
|
|
|
|
|
|
13943
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (1004 entries: 1003 keys
// and a trailing 0). The `_ragel_` prefix suggests it is emitted by the Ragel
// state machine compiler, so it should presumably be regenerated, not edited.
// NOTE(review): in Ragel's table layout this holds, state after state, first
// the single character keys and then the [low, high] range keys of each
// state; values above 255 (303, 559, 809, 1065) would then be characters
// combined with a condition. The code that reads the table is outside this
// part of the file -- confirm before relying on it.
static const short _ragel_url_email_trans_keys[] = {
	33u, 48u, 49u, 50u, 95u, 36u, 37u, 39u,
	46u, 51u, 57u, 65u, 90u, 97u, 122u, 33u,
	58u, 64u, 95u, 36u, 37u, 39u, 46u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 95u, 36u,
	37u, 39u, 46u, 48u, 57u, 65u, 90u, 97u,
	122u, 33u, 64u, 95u, 36u, 37u, 39u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 48u, 49u,
	50u, 51u, 57u, 65u, 90u, 97u, 122u, 45u,
	46u, 48u, 57u, 65u, 90u, 97u, 122u, 45u,
	48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 33u, 39u, 41u, 61u,
	95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u,
	64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u,
	44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u,
	151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u,
	59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u,
	159u, 48u, 49u, 50u, 51u, 57u, 65u, 90u,
	97u, 122u, 45u, 46u, 48u, 57u, 65u, 90u,
	97u, 122u, 48u, 49u, 50u, 51u, 57u, 65u,
	90u, 97u, 122u, 45u, 46u, 48u, 57u, 65u,
	90u, 97u, 122u, 48u, 49u, 50u, 51u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 53u, 48u,
	52u, 54u, 57u, 65u, 90u, 97u, 122u, 45u,
	46u, 48u, 53u, 54u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 48u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 53u, 48u, 52u, 54u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 48u, 53u,
	54u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	48u, 57u, 65u, 90u, 97u, 122u, 45u, 46u,
	53u, 48u, 52u, 54u, 57u, 65u, 90u, 97u,
	122u, 45u, 46u, 48u, 53u, 54u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 58u, 64u, 95u,
	36u, 37u, 39u, 46u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 58u, 64u, 95u, 36u, 37u,
	39u, 46u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 48u, 49u, 50u, 58u, 64u, 95u, 36u,
	37u, 39u, 46u, 51u, 57u, 65u, 90u, 97u,
	122u, 33u, 45u, 46u, 58u, 64u, 95u, 36u,
	37u, 39u, 44u, 48u, 57u, 65u, 90u, 97u,
	122u, 33u, 48u, 49u, 50u, 58u, 64u, 95u,
	36u, 37u, 39u, 46u, 51u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 48u, 49u, 50u, 58u, 64u,
	95u, 36u, 37u, 39u, 46u, 51u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 53u, 58u,
	64u, 95u, 36u, 37u, 39u, 44u, 48u, 52u,
	54u, 57u, 65u, 90u, 97u, 122u, 33u, 45u,
	46u, 58u, 64u, 95u, 36u, 37u, 39u, 44u,
	48u, 53u, 54u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 53u, 58u, 64u, 95u, 36u,
	37u, 39u, 44u, 48u, 52u, 54u, 57u, 65u,
	90u, 97u, 122u, 33u, 45u, 46u, 58u, 64u,
	95u, 36u, 37u, 39u, 44u, 48u, 53u, 54u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	58u, 64u, 95u, 36u, 37u, 39u, 44u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	53u, 58u, 64u, 95u, 36u, 37u, 39u, 44u,
	48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 36u, 37u,
	39u, 44u, 48u, 53u, 54u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 58u, 64u, 95u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 47u, 95u, 36u, 37u, 39u,
	57u, 65u, 90u, 97u, 122u, 47u, 33u, 48u,
	49u, 50u, 95u, 36u, 37u, 39u, 46u, 51u,
	57u, 65u, 90u, 97u, 122u, 45u, 46u, 58u,
	303u, 559u, 48u, 57u, 65u, 90u, 97u, 122u,
	303u, 559u, 48u, 57u, 33u, 39u, 41u, 61u,
	95u, 36u, 47u, 48u, 57u, 58u, 59u, 63u,
	64u, 65u, 90u, 97u, 122u, 33u, 39u, 40u,
	44u, 46u, 61u, 63u, 95u, 129u, 131u, 135u,
	151u, 809u, 1065u, 36u, 38u, 42u, 57u, 58u,
	59u, 64u, 90u, 97u, 122u, 142u, 143u, 155u,
	159u, 45u, 46u, 58u, 303u, 559u, 48u, 57u,
	65u, 90u, 97u, 122u, 45u, 46u, 58u, 303u,
	559u, 48u, 57u, 65u, 90u, 97u, 122u, 45u,
	46u, 58u, 303u, 559u, 48u, 57u, 65u, 90u,
	97u, 122u, 45u, 46u, 53u, 58u, 303u, 559u,
	48u, 52u, 54u, 57u, 65u, 90u, 97u, 122u,
	45u, 46u, 58u, 303u, 559u, 48u, 53u, 54u,
	57u, 65u, 90u, 97u, 122u, 33u, 45u, 46u,
	58u, 64u, 95u, 303u, 559u, 36u, 37u, 39u,
	44u, 48u, 57u, 65u, 90u, 97u, 122u, 33u,
	95u, 303u, 559u, 36u, 37u, 39u, 46u, 48u,
	57u, 65u, 90u, 97u, 122u, 33u, 64u, 95u,
	303u, 559u, 36u, 37u, 39u, 46u, 48u, 57u,
	65u, 90u, 97u, 122u, 33u, 45u, 46u, 58u,
	64u, 95u, 303u, 559u, 36u, 37u, 39u, 44u,
	48u, 57u, 65u, 90u, 97u, 122u, 33u, 45u,
	46u, 58u, 64u, 95u, 303u, 559u, 36u, 37u,
	39u, 44u, 48u, 57u, 65u, 90u, 97u, 122u,
	33u, 45u, 46u, 58u, 64u, 95u, 303u, 559u,
	36u, 37u, 39u, 44u, 48u, 57u, 65u, 90u,
	97u, 122u, 33u, 45u, 46u, 53u, 58u, 64u,
	95u, 303u, 559u, 36u, 37u, 39u, 44u, 48u,
	52u, 54u, 57u, 65u, 90u, 97u, 122u, 33u,
	45u, 46u, 58u, 64u, 95u, 303u, 559u, 36u,
	37u, 39u, 44u, 48u, 53u, 54u, 57u, 65u,
	90u, 97u, 122u, 0
};
|
14071
|
|
|
|
|
|
|
|
|
14072
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (75 entries). The `_ragel_`
// prefix suggests it is emitted by the Ragel state machine compiler, so it
// should presumably be regenerated rather than edited by hand.
// NOTE(review): in Ragel's table layout this would be the number of single
// character keys of each state; the code that reads it is outside this part
// of the file -- confirm before relying on it.
static const char _ragel_url_email_single_lengths[] = {
	0, 5, 4, 2, 3, 3, 2, 1,
	2, 0, 2, 5, 14, 3, 2, 3,
	2, 3, 2, 2, 3, 2, 2, 2,
	3, 2, 2, 2, 3, 2, 6, 5,
	6, 4, 6, 7, 6, 7, 6, 7,
	6, 6, 7, 6, 6, 6, 7, 6,
	6, 6, 7, 6, 6, 6, 6, 3,
	1, 5, 5, 2, 5, 14, 5, 5,
	5, 6, 5, 8, 4, 5, 8, 8,
	8, 9, 8
};
|
14084
|
|
|
|
|
|
|
|
|
14085
|
|
|
|
|
|
|
// Lookup table of the `ragel_url_email` recognizer (75 entries). The `_ragel_`
// prefix suggests it is emitted by the Ragel state machine compiler, so it
// should presumably be regenerated rather than edited by hand.
// NOTE(review): in Ragel's table layout this would be the number of
// [low, high] range keys of each state; the code that reads it is outside
// this part of the file -- confirm before relying on it.
static const char _ragel_url_email_range_lengths[] = {
	0, 5, 5, 5, 5, 3, 3, 3,
	3, 3, 3, 6, 7, 3, 3, 3,
	3, 3, 3, 3, 4, 4, 3, 3,
	4, 4, 3, 3, 4, 4, 5, 5,
	5, 5, 5, 5, 5, 5, 5, 5,
	5, 5, 6, 6, 5, 5, 6, 6,
	5, 5, 6, 6, 5, 5, 5, 4,
	0, 5, 3, 1, 6, 7, 3, 3,
	3, 4, 4, 5, 5, 5, 5, 5,
	5, 6, 6
};
|
14097
|
|
|
|
|
|
|
|
|
14098
|
|
|
|
|
|
|
static const short _ragel_url_email_index_offsets[] = { |
|
14099
|
|
|
|
|
|
|
0, 0, 11, 21, 29, 38, 45, 51, |
|
14100
|
|
|
|
|
|
|
56, 62, 66, 72, 84, 106, 113, 119, |
|
14101
|
|
|
|
|
|
|
126, 132, 139, 145, 151, 159, 166, 172, |
|
14102
|
|
|
|
|
|
|
178, 186, 193, 199, 205, 213, 220, 232, |
|
14103
|
|
|
|
|
|
|
243, 255, 265, 277, 290, 302, 315, 327, |
|
14104
|
|
|
|
|
|
|
340, 352, 364, 378, 391, 403, 415, 429, |
|
14105
|
|
|
|
|
|
|
442, 454, 466, 480, 493, 505, 517, 529, |
|
14106
|
|
|
|
|
|
|
537, 539, 550, 559, 563, 575, 597, 606, |
|
14107
|
|
|
|
|
|
|
615, 624, 635, 645, 659, 669, 680, 694, |
|
14108
|
|
|
|
|
|
|
708, 722, 738 |
|
14109
|
|
|
|
|
|
|
}; |
|
14110
|
|
|
|
|
|
|
|
|
14111
|
|
|
|
|
|
|
static const char _ragel_url_email_indicies[] = { |
|
14112
|
|
|
|
|
|
|
0, 2, 3, 4, 0, 0, 0, 5, |
|
14113
|
|
|
|
|
|
|
6, 6, 1, 0, 7, 8, 0, 0, |
|
14114
|
|
|
|
|
|
|
0, 0, 0, 0, 1, 9, 9, 9, |
|
14115
|
|
|
|
|
|
|
9, 9, 9, 9, 1, 9, 8, 9, |
|
14116
|
|
|
|
|
|
|
9, 9, 9, 9, 9, 1, 10, 11, |
|
14117
|
|
|
|
|
|
|
12, 13, 14, 14, 1, 15, 16, 14, |
|
14118
|
|
|
|
|
|
|
14, 14, 1, 15, 14, 14, 14, 1, |
|
14119
|
|
|
|
|
|
|
15, 17, 14, 14, 14, 1, 14, 18, |
|
14120
|
|
|
|
|
|
|
18, 1, 15, 17, 14, 19, 19, 1, |
|
14121
|
|
|
|
|
|
|
20, 21, 21, 20, 20, 20, 21, 20, |
|
14122
|
|
|
|
|
|
|
20, 21, 21, 1, 22, 22, 24, 22, |
|
14123
|
|
|
|
|
|
|
22, 23, 22, 23, 23, 23, 23, 23, |
|
14124
|
|
|
|
|
|
|
25, 26, 23, 23, 22, 23, 23, 23, |
|
14125
|
|
|
|
|
|
|
23, 1, 27, 28, 29, 30, 18, 18, |
|
14126
|
|
|
|
|
|
|
1, 15, 31, 14, 14, 14, 1, 32, |
|
14127
|
|
|
|
|
|
|
33, 34, 35, 18, 18, 1, 15, 36, |
|
14128
|
|
|
|
|
|
|
14, 14, 14, 1, 37, 38, 39, 40, |
|
14129
|
|
|
|
|
|
|
18, 18, 1, 15, 36, 35, 14, 14, |
|
14130
|
|
|
|
|
|
|
1, 15, 36, 32, 14, 14, 1, 15, |
|
14131
|
|
|
|
|
|
|
36, 41, 35, 32, 14, 14, 1, 15, |
|
14132
|
|
|
|
|
|
|
36, 32, 14, 14, 14, 1, 15, 31, |
|
14133
|
|
|
|
|
|
|
30, 14, 14, 1, 15, 31, 27, 14, |
|
14134
|
|
|
|
|
|
|
14, 1, 15, 31, 42, 30, 27, 14, |
|
14135
|
|
|
|
|
|
|
14, 1, 15, 31, 27, 14, 14, 14, |
|
14136
|
|
|
|
|
|
|
1, 15, 16, 13, 14, 14, 1, 15, |
|
14137
|
|
|
|
|
|
|
16, 10, 14, 14, 1, 15, 16, 43, |
|
14138
|
|
|
|
|
|
|
13, 10, 14, 14, 1, 15, 16, 10, |
|
14139
|
|
|
|
|
|
|
14, 14, 14, 1, 0, 44, 45, 7, |
|
14140
|
|
|
|
|
|
|
8, 0, 0, 0, 46, 46, 46, 1, |
|
14141
|
|
|
|
|
|
|
0, 44, 7, 8, 0, 0, 0, 46, |
|
14142
|
|
|
|
|
|
|
46, 46, 1, 0, 44, 47, 7, 8, |
|
14143
|
|
|
|
|
|
|
0, 0, 0, 46, 46, 46, 1, 0, |
|
14144
|
|
|
|
|
|
|
7, 8, 0, 0, 0, 46, 48, 48, |
|
14145
|
|
|
|
|
|
|
1, 0, 44, 47, 7, 8, 0, 0, |
|
14146
|
|
|
|
|
|
|
0, 46, 49, 49, 1, 0, 50, 51, |
|
14147
|
|
|
|
|
|
|
52, 7, 8, 0, 0, 0, 53, 48, |
|
14148
|
|
|
|
|
|
|
48, 1, 0, 44, 54, 7, 8, 0, |
|
14149
|
|
|
|
|
|
|
0, 0, 46, 46, 46, 1, 0, 55, |
|
14150
|
|
|
|
|
|
|
56, 57, 7, 8, 0, 0, 0, 58, |
|
14151
|
|
|
|
|
|
|
48, 48, 1, 0, 44, 59, 7, 8, |
|
14152
|
|
|
|
|
|
|
0, 0, 0, 46, 46, 46, 1, 0, |
|
14153
|
|
|
|
|
|
|
60, 61, 62, 7, 8, 0, 0, 0, |
|
14154
|
|
|
|
|
|
|
63, 48, 48, 1, 0, 44, 59, 7, |
|
14155
|
|
|
|
|
|
|
8, 0, 0, 0, 58, 46, 46, 1, |
|
14156
|
|
|
|
|
|
|
0, 44, 59, 7, 8, 0, 0, 0, |
|
14157
|
|
|
|
|
|
|
55, 46, 46, 1, 0, 44, 59, 64, |
|
14158
|
|
|
|
|
|
|
7, 8, 0, 0, 0, 58, 55, 46, |
|
14159
|
|
|
|
|
|
|
46, 1, 0, 44, 59, 7, 8, 0, |
|
14160
|
|
|
|
|
|
|
0, 0, 55, 46, 46, 46, 1, 0, |
|
14161
|
|
|
|
|
|
|
44, 54, 7, 8, 0, 0, 0, 53, |
|
14162
|
|
|
|
|
|
|
46, 46, 1, 0, 44, 54, 7, 8, |
|
14163
|
|
|
|
|
|
|
0, 0, 0, 50, 46, 46, 1, 0, |
|
14164
|
|
|
|
|
|
|
44, 54, 65, 7, 8, 0, 0, 0, |
|
14165
|
|
|
|
|
|
|
53, 50, 46, 46, 1, 0, 44, 54, |
|
14166
|
|
|
|
|
|
|
7, 8, 0, 0, 0, 50, 46, 46, |
|
14167
|
|
|
|
|
|
|
46, 1, 0, 44, 45, 7, 8, 0, |
|
14168
|
|
|
|
|
|
|
0, 0, 5, 46, 46, 1, 0, 44, |
|
14169
|
|
|
|
|
|
|
45, 7, 8, 0, 0, 0, 2, 46, |
|
14170
|
|
|
|
|
|
|
46, 1, 0, 44, 45, 66, 7, 8, |
|
14171
|
|
|
|
|
|
|
0, 0, 0, 5, 2, 46, 46, 1, |
|
14172
|
|
|
|
|
|
|
0, 44, 45, 7, 8, 0, 0, 0, |
|
14173
|
|
|
|
|
|
|
2, 46, 46, 46, 1, 0, 44, 47, |
|
14174
|
|
|
|
|
|
|
7, 8, 0, 0, 0, 46, 67, 67, |
|
14175
|
|
|
|
|
|
|
1, 0, 44, 47, 7, 8, 0, 0, |
|
14176
|
|
|
|
|
|
|
0, 46, 68, 68, 1, 0, 44, 47, |
|
14177
|
|
|
|
|
|
|
69, 8, 0, 0, 0, 46, 68, 68, |
|
14178
|
|
|
|
|
|
|
1, 9, 70, 9, 9, 9, 9, 9, |
|
14179
|
|
|
|
|
|
|
1, 71, 1, 0, 2, 3, 4, 0, |
|
14180
|
|
|
|
|
|
|
0, 0, 5, 46, 46, 1, 15, 17, |
|
14181
|
|
|
|
|
|
|
72, 21, 23, 14, 19, 19, 1, 21, |
|
14182
|
|
|
|
|
|
|
23, 72, 1, 20, 21, 21, 20, 20, |
|
14183
|
|
|
|
|
|
|
20, 21, 20, 20, 21, 21, 1, 22, |
|
14184
|
|
|
|
|
|
|
22, 24, 22, 22, 23, 22, 23, 23, |
|
14185
|
|
|
|
|
|
|
23, 23, 23, 25, 26, 23, 23, 22, |
|
14186
|
|
|
|
|
|
|
23, 23, 23, 23, 1, 15, 17, 72, |
|
14187
|
|
|
|
|
|
|
21, 23, 14, 14, 14, 1, 15, 17, |
|
14188
|
|
|
|
|
|
|
72, 21, 23, 40, 14, 14, 1, 15, |
|
14189
|
|
|
|
|
|
|
17, 72, 21, 23, 37, 14, 14, 1, |
|
14190
|
|
|
|
|
|
|
15, 17, 73, 72, 21, 23, 40, 37, |
|
14191
|
|
|
|
|
|
|
14, 14, 1, 15, 17, 72, 21, 23, |
|
14192
|
|
|
|
|
|
|
37, 14, 14, 14, 1, 0, 44, 47, |
|
14193
|
|
|
|
|
|
|
74, 8, 0, 21, 23, 0, 0, 46, |
|
14194
|
|
|
|
|
|
|
49, 49, 1, 9, 9, 21, 23, 9, |
|
14195
|
|
|
|
|
|
|
9, 75, 9, 9, 1, 9, 8, 9, |
|
14196
|
|
|
|
|
|
|
21, 23, 9, 9, 75, 9, 9, 1, |
|
14197
|
|
|
|
|
|
|
0, 44, 47, 74, 8, 0, 21, 23, |
|
14198
|
|
|
|
|
|
|
0, 0, 46, 46, 46, 1, 0, 44, |
|
14199
|
|
|
|
|
|
|
47, 74, 8, 0, 21, 23, 0, 0, |
|
14200
|
|
|
|
|
|
|
63, 46, 46, 1, 0, 44, 47, 74, |
|
14201
|
|
|
|
|
|
|
8, 0, 21, 23, 0, 0, 60, 46, |
|
14202
|
|
|
|
|
|
|
46, 1, 0, 44, 47, 76, 74, 8, |
|
14203
|
|
|
|
|
|
|
0, 21, 23, 0, 0, 63, 60, 46, |
|
14204
|
|
|
|
|
|
|
46, 1, 0, 44, 47, 74, 8, 0, |
|
14205
|
|
|
|
|
|
|
21, 23, 0, 0, 60, 46, 46, 46, |
|
14206
|
|
|
|
|
|
|
1, 0 |
|
14207
|
|
|
|
|
|
|
}; |
|
14208
|
|
|
|
|
|
|
|
|
14209
|
|
|
|
|
|
|
static const char _ragel_url_email_trans_targs[] = { |
|
14210
|
|
|
|
|
|
|
2, 0, 30, 48, 50, 49, 52, 3, |
|
14211
|
|
|
|
|
|
|
5, 4, 6, 26, 28, 27, 8, 7, |
|
14212
|
|
|
|
|
|
|
13, 9, 10, 58, 11, 60, 12, 61, |
|
14213
|
|
|
|
|
|
|
61, 12, 61, 14, 22, 24, 23, 15, |
|
14214
|
|
|
|
|
|
|
16, 18, 20, 19, 17, 62, 63, 65, |
|
14215
|
|
|
|
|
|
|
64, 21, 25, 29, 31, 35, 32, 33, |
|
14216
|
|
|
|
|
|
|
34, 67, 36, 44, 46, 45, 37, 38, |
|
14217
|
|
|
|
|
|
|
40, 42, 41, 39, 70, 71, 73, 72, |
|
14218
|
|
|
|
|
|
|
43, 47, 51, 53, 54, 55, 56, 57, |
|
14219
|
|
|
|
|
|
|
59, 66, 68, 69, 74 |
|
14220
|
|
|
|
|
|
|
}; |
|
14221
|
|
|
|
|
|
|
|
|
14222
|
|
|
|
|
|
|
static const char _ragel_url_email_trans_actions[] = { |
|
14223
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
14224
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
14225
|
|
|
|
|
|
|
0, 0, 0, 1, 0, 1, 0, 1, |
|
14226
|
|
|
|
|
|
|
2, 3, 4, 0, 0, 0, 0, 0, |
|
14227
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 1, 1, 1, |
|
14228
|
|
|
|
|
|
|
1, 0, 0, 0, 0, 0, 0, 0, |
|
14229
|
|
|
|
|
|
|
0, 1, 0, 0, 0, 0, 0, 0, |
|
14230
|
|
|
|
|
|
|
0, 0, 0, 0, 1, 1, 1, 1, |
|
14231
|
|
|
|
|
|
|
0, 0, 0, 0, 0, 0, 0, 0, |
|
14232
|
|
|
|
|
|
|
1, 1, 1, 1, 1 |
|
14233
|
|
|
|
|
|
|
}; |
|
14234
|
|
|
|
|
|
|
|
|
14235
|
|
|
|
|
|
|
static const int ragel_url_email_start = 1; |
|
14236
|
|
|
|
|
|
|
|
|
14237
|
2
|
|
|
|
|
|
vector ragel_tokenizer::ragel_map; |
|
14238
|
|
|
|
|
|
|
atomic_flag ragel_tokenizer::ragel_map_flag = ATOMIC_FLAG_INIT; |
|
14239
|
|
|
|
|
|
|
|
|
14240
|
0
|
|
|
|
|
|
// Constructs the tokenizer: forwards `url_email_tokenizer` to the
// unicode_tokenizer base and makes sure the shared ragel character map
// has been populated.
ragel_tokenizer::ragel_tokenizer(unsigned url_email_tokenizer)
    : unicode_tokenizer(url_email_tokenizer) {
  ragel_tokenizer::initialize_ragel_map();
}
|
14243
|
|
|
|
|
|
|
|
|
14244
|
2
|
|
|
|
|
|
void ragel_tokenizer::initialize_ragel_map() { |
|
14245
|
1
|
50
|
|
|
|
|
while (ragel_map_flag.test_and_set()) {} |
|
14246
|
1
|
50
|
|
|
|
|
if (ragel_map.empty()) { |
|
14247
|
129
|
100
|
|
|
|
|
for (uint8_t ascii = 0; ascii < 128; ascii++) |
|
14248
|
128
|
|
|
|
|
|
ragel_map.push_back(ascii); |
|
14249
|
|
|
|
|
|
|
|
|
14250
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2026', 160); // horizontal ellipsis (TRIPLE DOT) |
|
14251
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2019', 161); // right single quotation mark |
|
14252
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2018', 162); // left single quotation mark |
|
14253
|
1
|
|
|
|
|
|
ragel_map_add(U'\u2010', 163); // hyphen |
|
14254
|
|
|
|
|
|
|
} |
|
14255
|
|
|
|
|
|
|
ragel_map_flag.clear(); |
|
14256
|
1
|
|
|
|
|
|
} |
|
14257
|
|
|
|
|
|
|
|
|
14258
|
4
|
|
|
|
|
|
// Stores `mapping` at index `chr` of `ragel_map`.
//
// When `chr` lies beyond the current end of the table, the table is first
// grown to `chr + 1` entries, with every newly created entry set to 128.
void ragel_tokenizer::ragel_map_add(char32_t chr, uint8_t mapping) {
  const bool already_covered = chr < ragel_map.size();
  if (!already_covered) {
    ragel_map.resize(chr + 1, 128);
  }
  ragel_map[chr] = mapping;
}
|
14263
|
|
|
|
|
|
|
|
|
14264
|
7
|
|
|
|
|
|
bool ragel_tokenizer::ragel_url_email(unsigned version, const vector& chars, size_t& current, vector& tokens) { |
|
14265
|
|
|
|
|
|
|
int cs; |
|
14266
|
|
|
|
|
|
|
|
|
14267
|
7
|
|
|
|
|
|
size_t start = current, end = current, parens = 0; |
|
14268
|
|
|
|
|
|
|
|
|
14269
|
|
|
|
|
|
|
{ |
|
14270
|
|
|
|
|
|
|
cs = ragel_url_email_start; |
|
14271
|
|
|
|
|
|
|
} |
|
14272
|
|
|
|
|
|
|
|
|
14273
|
|
|
|
|
|
|
{ |
|
14274
|
|
|
|
|
|
|
int _klen; |
|
14275
|
|
|
|
|
|
|
const short *_keys; |
|
14276
|
|
|
|
|
|
|
int _trans; |
|
14277
|
|
|
|
|
|
|
short _widec; |
|
14278
|
|
|
|
|
|
|
|
|
14279
|
7
|
50
|
|
|
|
|
if ( ( current) == ( (chars.size() - 1)) ) |
|
14280
|
|
|
|
|
|
|
goto _test_eof; |
|
14281
|
|
|
|
|
|
|
if ( cs == 0 ) |
|
14282
|
|
|
|
|
|
|
goto _out; |
|
14283
|
|
|
|
|
|
|
_resume: |
|
14284
|
60
|
|
|
|
|
|
_widec = ( ragel_char(chars[current])); |
|
14285
|
30
|
|
|
|
|
|
_klen = _ragel_url_email_cond_lengths[cs]; |
|
14286
|
30
|
|
|
|
|
|
_keys = _ragel_url_email_cond_keys + (_ragel_url_email_cond_offsets[cs]*2); |
|
14287
|
30
|
50
|
|
|
|
|
if ( _klen > 0 ) { |
|
14288
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
14289
|
|
|
|
|
|
|
const short *_mid; |
|
14290
|
0
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
14291
|
|
|
|
|
|
|
while (1) { |
|
14292
|
0
|
0
|
|
|
|
|
if ( _upper < _lower ) |
|
14293
|
|
|
|
|
|
|
break; |
|
14294
|
|
|
|
|
|
|
|
|
14295
|
0
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
14296
|
0
|
0
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
14297
|
0
|
|
|
|
|
|
_upper = _mid - 2; |
|
14298
|
0
|
0
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
14299
|
0
|
|
|
|
|
|
_lower = _mid + 2; |
|
14300
|
|
|
|
|
|
|
else { |
|
14301
|
0
|
|
|
|
|
|
switch ( _ragel_url_email_cond_spaces[_ragel_url_email_cond_offsets[cs] + ((_mid - _keys)>>1)] ) { |
|
14302
|
|
|
|
|
|
|
case 0: { |
|
14303
|
0
|
|
|
|
|
|
_widec = (short)(256u + (( ragel_char(chars[current])) - 0u)); |
|
14304
|
0
|
0
|
|
|
|
|
if ( |
|
14305
|
0
|
|
|
|
|
|
version >= 2 ) _widec += 256; |
|
14306
|
|
|
|
|
|
|
break; |
|
14307
|
|
|
|
|
|
|
} |
|
14308
|
|
|
|
|
|
|
case 1: { |
|
14309
|
0
|
|
|
|
|
|
_widec = (short)(768u + (( ragel_char(chars[current])) - 0u)); |
|
14310
|
0
|
0
|
|
|
|
|
if ( |
|
14311
|
0
|
|
|
|
|
|
parens ) _widec += 256; |
|
14312
|
|
|
|
|
|
|
break; |
|
14313
|
|
|
|
|
|
|
} |
|
14314
|
|
|
|
|
|
|
} |
|
14315
|
|
|
|
|
|
|
break; |
|
14316
|
|
|
|
|
|
|
} |
|
14317
|
|
|
|
|
|
|
} |
|
14318
|
|
|
|
|
|
|
} |
|
14319
|
|
|
|
|
|
|
|
|
14320
|
30
|
|
|
|
|
|
_keys = _ragel_url_email_trans_keys + _ragel_url_email_key_offsets[cs]; |
|
14321
|
30
|
|
|
|
|
|
_trans = _ragel_url_email_index_offsets[cs]; |
|
14322
|
|
|
|
|
|
|
|
|
14323
|
30
|
|
|
|
|
|
_klen = _ragel_url_email_single_lengths[cs]; |
|
14324
|
30
|
50
|
|
|
|
|
if ( _klen > 0 ) { |
|
14325
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
14326
|
|
|
|
|
|
|
const short *_mid; |
|
14327
|
117
|
|
|
|
|
|
const short *_upper = _keys + _klen - 1; |
|
14328
|
|
|
|
|
|
|
while (1) { |
|
14329
|
117
|
100
|
|
|
|
|
if ( _upper < _lower ) |
|
14330
|
|
|
|
|
|
|
break; |
|
14331
|
|
|
|
|
|
|
|
|
14332
|
87
|
|
|
|
|
|
_mid = _lower + ((_upper-_lower) >> 1); |
|
14333
|
87
|
100
|
|
|
|
|
if ( _widec < *_mid ) |
|
14334
|
13
|
|
|
|
|
|
_upper = _mid - 1; |
|
14335
|
74
|
50
|
|
|
|
|
else if ( _widec > *_mid ) |
|
14336
|
74
|
|
|
|
|
|
_lower = _mid + 1; |
|
14337
|
|
|
|
|
|
|
else { |
|
14338
|
0
|
|
|
|
|
|
_trans += (unsigned int)(_mid - _keys); |
|
14339
|
0
|
|
|
|
|
|
goto _match; |
|
14340
|
|
|
|
|
|
|
} |
|
14341
|
|
|
|
|
|
|
} |
|
14342
|
30
|
|
|
|
|
|
_keys += _klen; |
|
14343
|
30
|
|
|
|
|
|
_trans += _klen; |
|
14344
|
|
|
|
|
|
|
} |
|
14345
|
|
|
|
|
|
|
|
|
14346
|
30
|
|
|
|
|
|
_klen = _ragel_url_email_range_lengths[cs]; |
|
14347
|
30
|
50
|
|
|
|
|
if ( _klen > 0 ) { |
|
14348
|
|
|
|
|
|
|
const short *_lower = _keys; |
|
14349
|
|
|
|
|
|
|
const short *_mid; |
|
14350
|
93
|
|
|
|
|
|
const short *_upper = _keys + (_klen<<1) - 2; |
|
14351
|
|
|
|
|
|
|
while (1) { |
|
14352
|
93
|
100
|
|
|
|
|
if ( _upper < _lower ) |
|
14353
|
|
|
|
|
|
|
break; |
|
14354
|
|
|
|
|
|
|
|
|
14355
|
86
|
|
|
|
|
|
_mid = _lower + (((_upper-_lower) >> 1) & ~1); |
|
14356
|
86
|
100
|
|
|
|
|
if ( _widec < _mid[0] ) |
|
14357
|
9
|
|
|
|
|
|
_upper = _mid - 2; |
|
14358
|
77
|
100
|
|
|
|
|
else if ( _widec > _mid[1] ) |
|
14359
|
54
|
|
|
|
|
|
_lower = _mid + 2; |
|
14360
|
|
|
|
|
|
|
else { |
|
14361
|
23
|
|
|
|
|
|
_trans += (unsigned int)((_mid - _keys)>>1); |
|
14362
|
23
|
|
|
|
|
|
goto _match; |
|
14363
|
|
|
|
|
|
|
} |
|
14364
|
|
|
|
|
|
|
} |
|
14365
|
7
|
|
|
|
|
|
_trans += _klen; |
|
14366
|
|
|
|
|
|
|
} |
|
14367
|
|
|
|
|
|
|
|
|
14368
|
|
|
|
|
|
|
_match: |
|
14369
|
30
|
|
|
|
|
|
_trans = _ragel_url_email_indicies[_trans]; |
|
14370
|
30
|
|
|
|
|
|
cs = _ragel_url_email_trans_targs[_trans]; |
|
14371
|
|
|
|
|
|
|
|
|
14372
|
30
|
50
|
|
|
|
|
if ( _ragel_url_email_trans_actions[_trans] == 0 ) |
|
14373
|
|
|
|
|
|
|
goto _again; |
|
14374
|
|
|
|
|
|
|
|
|
14375
|
0
|
|
|
|
|
|
switch ( _ragel_url_email_trans_actions[_trans] ) { |
|
14376
|
|
|
|
|
|
|
case 3: |
|
14377
|
0
|
|
|
|
|
|
{parens-=!!parens;} |
|
14378
|
0
|
|
|
|
|
|
break; |
|
14379
|
|
|
|
|
|
|
case 1: |
|
14380
|
0
|
|
|
|
|
|
{ end = current + 1; } |
|
14381
|
0
|
|
|
|
|
|
break; |
|
14382
|
|
|
|
|
|
|
case 2: |
|
14383
|
0
|
|
|
|
|
|
{parens++;} |
|
14384
|
0
|
|
|
|
|
|
{ end = current + 1; } |
|
14385
|
0
|
|
|
|
|
|
break; |
|
14386
|
|
|
|
|
|
|
case 4: |
|
14387
|
0
|
|
|
|
|
|
{parens-=!!parens;} |
|
14388
|
0
|
|
|
|
|
|
{ end = current + 1; } |
|
14389
|
0
|
|
|
|
|
|
break; |
|
14390
|
|
|
|
|
|
|
} |
|
14391
|
|
|
|
|
|
|
|
|
14392
|
|
|
|
|
|
|
_again: |
|
14393
|
30
|
100
|
|
|
|
|
if ( cs == 0 ) |
|
14394
|
|
|
|
|
|
|
goto _out; |
|
14395
|
23
|
50
|
|
|
|
|
if ( ++( current) != ( (chars.size() - 1)) ) |
|
14396
|
|
|
|
|
|
|
goto _resume; |
|
14397
|
|
|
|
|
|
|
_test_eof: {} |
|
14398
|
|
|
|
|
|
|
_out: {} |
|
14399
|
|
|
|
|
|
|
} |
|
14400
|
|
|
|
|
|
|
|
|
14401
|
7
|
50
|
|
|
|
|
if (end > start) { |
|
14402
|
0
|
|
|
|
|
|
tokens.emplace_back(start, end - start); |
|
14403
|
0
|
|
|
|
|
|
current = end; |
|
14404
|
0
|
|
|
|
|
|
return true; |
|
14405
|
|
|
|
|
|
|
} else { |
|
14406
|
7
|
|
|
|
|
|
current = start; |
|
14407
|
7
|
|
|
|
|
|
return false; |
|
14408
|
|
|
|
|
|
|
} |
|
14409
|
|
|
|
|
|
|
} |
|
14410
|
|
|
|
|
|
|
|
|
14411
|
|
|
|
|
|
|
} // namespace morphodita |
|
14412
|
|
|
|
|
|
|
|
|
14413
|
|
|
|
|
|
|
///////// |
|
14414
|
|
|
|
|
|
|
// File: morphodita/tokenizer/vertical_tokenizer.h |
|
14415
|
|
|
|
|
|
|
///////// |
|
14416
|
|
|
|
|
|
|
|
|
14417
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14418
|
|
|
|
|
|
|
// |
|
14419
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14420
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14421
|
|
|
|
|
|
|
// |
|
14422
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14423
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14424
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14425
|
|
|
|
|
|
|
|
|
14426
|
|
|
|
|
|
|
namespace morphodita { |
|
14427
|
|
|
|
|
|
|
|
|
14428
|
0
|
|
|
|
|
|
class vertical_tokenizer : public unicode_tokenizer { |
|
14429
|
|
|
|
|
|
|
public: |
|
14430
|
0
|
0
|
|
|
|
|
vertical_tokenizer() : unicode_tokenizer(0) {} |
|
14431
|
|
|
|
|
|
|
|
|
14432
|
|
|
|
|
|
|
virtual bool next_sentence(vector& tokens) override; |
|
14433
|
|
|
|
|
|
|
}; |
|
14434
|
|
|
|
|
|
|
|
|
14435
|
|
|
|
|
|
|
} // namespace morphodita |
|
14436
|
|
|
|
|
|
|
|
|
14437
|
|
|
|
|
|
|
///////// |
|
14438
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer.cpp |
|
14439
|
|
|
|
|
|
|
///////// |
|
14440
|
|
|
|
|
|
|
|
|
14441
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14442
|
|
|
|
|
|
|
// |
|
14443
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14444
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14445
|
|
|
|
|
|
|
// |
|
14446
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14447
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14448
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14449
|
|
|
|
|
|
|
|
|
14450
|
|
|
|
|
|
|
namespace morphodita { |
|
14451
|
|
|
|
|
|
|
|
|
14452
|
0
|
|
|
|
|
|
// Allocates a new vertical_tokenizer with `new` and returns it as a
// tokenizer pointer.
tokenizer* tokenizer::new_vertical_tokenizer() {
  tokenizer* created = new vertical_tokenizer();
  return created;
}
|
14455
|
|
|
|
|
|
|
|
|
14456
|
0
|
|
|
|
|
|
// Allocates a new czech_tokenizer (constructed with czech_tokenizer::CZECH
// and czech_tokenizer::LATEST) with `new` and returns it as a tokenizer
// pointer.
tokenizer* tokenizer::new_czech_tokenizer() {
  tokenizer* created = new czech_tokenizer(czech_tokenizer::CZECH, czech_tokenizer::LATEST);
  return created;
}
|
14459
|
|
|
|
|
|
|
|
|
14460
|
0
|
|
|
|
|
|
// Allocates a new english_tokenizer (constructed with
// english_tokenizer::LATEST) with `new` and returns it as a tokenizer
// pointer.
tokenizer* tokenizer::new_english_tokenizer() {
  tokenizer* created = new english_tokenizer(english_tokenizer::LATEST);
  return created;
}
|
14463
|
|
|
|
|
|
|
|
|
14464
|
0
|
|
|
|
|
|
// Allocates a new generic_tokenizer (constructed with
// generic_tokenizer::LATEST) with `new` and returns it as a tokenizer
// pointer.
tokenizer* tokenizer::new_generic_tokenizer() {
  tokenizer* created = new generic_tokenizer(generic_tokenizer::LATEST);
  return created;
}
|
14467
|
|
|
|
|
|
|
|
|
14468
|
|
|
|
|
|
|
} // namespace morphodita |
|
14469
|
|
|
|
|
|
|
|
|
14470
|
|
|
|
|
|
|
///////// |
|
14471
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer_ids.h |
|
14472
|
|
|
|
|
|
|
///////// |
|
14473
|
|
|
|
|
|
|
|
|
14474
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14475
|
|
|
|
|
|
|
// |
|
14476
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14477
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14478
|
|
|
|
|
|
|
// |
|
14479
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14480
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14481
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14482
|
|
|
|
|
|
|
|
|
14483
|
|
|
|
|
|
|
namespace morphodita { |
|
14484
|
|
|
|
|
|
|
|
|
14485
|
|
|
|
|
|
|
// Numeric identifiers of the tokenizer kinds, plus a parser from their
// lowercase textual names.
class tokenizer_ids {
 public:
  enum tokenizer_id {
    CZECH = 0,
    ENGLISH = 1,
    GENERIC = 2,
    GRU = 3,
  };

  // Looks `str` up among the names "czech", "english", "generic" and "gru"
  // (exact match). On a match stores the corresponding value in `id` and
  // returns true; otherwise leaves `id` untouched and returns false.
  static bool parse(const string& str, tokenizer_id& id) {
    static const struct {
      const char* name;
      tokenizer_id value;
    } known_ids[] = {
      {"czech", CZECH},
      {"english", ENGLISH},
      {"generic", GENERIC},
      {"gru", GRU},
    };

    for (const auto& known : known_ids) {
      if (str == known.name) {
        id = known.value;
        return true;
      }
    }
    return false;
  }
};
|
14502
|
|
|
|
|
|
|
|
|
14503
|
|
|
|
|
|
|
typedef tokenizer_ids::tokenizer_id tokenizer_id; |
|
14504
|
|
|
|
|
|
|
|
|
14505
|
|
|
|
|
|
|
} // namespace morphodita |
|
14506
|
|
|
|
|
|
|
|
|
14507
|
|
|
|
|
|
|
///////// |
|
14508
|
|
|
|
|
|
|
// File: morphodita/tokenizer/tokenizer_factory.cpp |
|
14509
|
|
|
|
|
|
|
///////// |
|
14510
|
|
|
|
|
|
|
|
|
14511
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14512
|
|
|
|
|
|
|
// |
|
14513
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14514
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14515
|
|
|
|
|
|
|
// |
|
14516
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14517
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14518
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14519
|
|
|
|
|
|
|
|
|
14520
|
|
|
|
|
|
|
namespace morphodita { |
|
14521
|
|
|
|
|
|
|
|
|
14522
|
1
|
|
|
|
|
|
// Reads one byte from `is`, interprets it as a tokenizer_id and lets the
// factory object created for that id load itself from the rest of the
// stream.
//
// Returns the released factory pointer when its load() succeeds, and
// nullptr when load() fails, when the id is ENGLISH, or when the id matches
// none of the cases.
//
// NOTE(review): the three `new_unique_ptr()` calls are textually identical
// in this listing; their template arguments (the concrete factory type per
// case) appear to have been lost — restore them from the upstream source.
tokenizer_factory* tokenizer_factory::load(istream& is) {
  const tokenizer_id id = tokenizer_id(is.get());

  switch (id) {
    case tokenizer_ids::GENERIC: {
      auto factory = new_unique_ptr();
      if (!factory->load(is)) break;
      return factory.release();
    }
    case tokenizer_ids::GRU: {
      auto factory = new_unique_ptr();
      if (!factory->load(is)) break;
      return factory.release();
    }
    case tokenizer_ids::CZECH: {
      auto factory = new_unique_ptr();
      if (!factory->load(is)) break;
      return factory.release();
    }
    case tokenizer_ids::ENGLISH:
      break;
  }

  return nullptr;
}
|
14549
|
|
|
|
|
|
|
|
|
14550
|
0
|
|
|
|
|
|
// Opens the file named `fname` (passed through path_from_utf8) in binary
// mode and delegates to the stream overload of load().
// Returns nullptr when the stream is not in a good state after opening.
tokenizer_factory* tokenizer_factory::load(const char* fname) {
  ifstream input(path_from_utf8(fname).c_str(), ifstream::binary);
  return input ? load(input) : nullptr;
}
|
14556
|
|
|
|
|
|
|
|
|
14557
|
|
|
|
|
|
|
} // namespace morphodita |
|
14558
|
|
|
|
|
|
|
|
|
14559
|
|
|
|
|
|
|
///////// |
|
14560
|
|
|
|
|
|
|
// File: morphodita/tokenizer/unicode_tokenizer.cpp |
|
14561
|
|
|
|
|
|
|
///////// |
|
14562
|
|
|
|
|
|
|
|
|
14563
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14564
|
|
|
|
|
|
|
// |
|
14565
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14566
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14567
|
|
|
|
|
|
|
// |
|
14568
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14569
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14570
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14571
|
|
|
|
|
|
|
|
|
14572
|
|
|
|
|
|
|
namespace morphodita { |
|
14573
|
|
|
|
|
|
|
|
|
14574
|
1
|
|
|
|
|
|
// Stores the `url_email_tokenizer` setting, makes sure the shared ragel
// character map is populated and starts with an empty text
// (null pointer, length 0).
unicode_tokenizer::unicode_tokenizer(unsigned url_email_tokenizer)
    : url_email_tokenizer(url_email_tokenizer) {
  ragel_tokenizer::initialize_ragel_map();

  const string_piece empty_text(nullptr, 0);
  set_text(empty_text);
}
|
14579
|
|
|
|
|
|
|
|
|
14580
|
2
|
|
|
|
|
|
// Replaces the text to be tokenized.
//
// When `make_copy` is set and `text.str` is non-null, the bytes are first
// copied into `text_buffer` and `text` is re-pointed at that copy.
// `current` is reset to 0 and `chars` is rebuilt: one entry per decoded
// UTF-8 character, each paired with the pointer to where that character
// begins, followed by a final entry with character value 0 pointing just
// past the last decoded character.
void unicode_tokenizer::set_text(string_piece text, bool make_copy /*= false*/) {
  using namespace unilib;

  if (make_copy && text.str) {
    text_buffer.assign(text.str, text.len);
    text.str = text_buffer.c_str();
  }

  current = 0;
  chars.clear();

  // utf8::decode is handed text.str/text.len and the loop relies on it to
  // advance them, so the start of each character is remembered beforehand.
  while (text.len) {
    const char* char_start = text.str;
    chars.emplace_back(utf8::decode(text.str, text.len), char_start);
  }

  // Terminating sentinel entry.
  chars.emplace_back(0, text.str);
}
|
14594
|
|
|
|
|
|
|
|
|
14595
|
2
|
|
|
|
|
|
// Produces the next sentence.
//
// Token ranges are written to `*tokens_ptr` when it is given, otherwise to
// the internal `tokens_buffer`. When `forms` is given it is cleared and then
// filled with one (pointer, byte length) entry per token, computed from the
// `str` pointers stored in `chars`. Returns false without tokenizing when
// `current` has reached the last entry of `chars`; otherwise returns
// whatever the single-argument next_sentence() overload returns.
//
// NOTE(review): the element types of both `vector` parameters are not
// visible in this listing (template arguments appear to have been lost) —
// confirm against the class declaration.
bool unicode_tokenizer::next_sentence(vector* forms, vector* tokens_ptr) {
  auto& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer;

  tokens.clear();
  if (forms) forms->clear();
  if (current >= chars.size() - 1) return false;

  const bool result = next_sentence(tokens);
  if (!forms) return result;

  for (auto&& token : tokens) {
    const auto form_begin = chars[token.start].str;
    const auto form_end = chars[token.start + token.length].str;
    forms->emplace_back(form_begin, form_end - form_begin);
  }
  return result;
}
|
14608
|
|
|
|
|
|
|
|
|
14609
|
7
|
|
|
|
|
|
// Tries to recognize a URL or e-mail at the current position by delegating
// to ragel_tokenizer::ragel_url_email. Returns false without calling it
// when `current` has reached the last entry of `chars`, or when
// `url_email_tokenizer` is zero.
//
// NOTE(review): the element type of the `vector` parameter is not visible
// in this listing (template argument appears to have been lost) — confirm
// against the class declaration.
bool unicode_tokenizer::tokenize_url_email(vector& tokens) {
  if (current >= chars.size() - 1) return false;
  if (!url_email_tokenizer) return false;

  return ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens);
}
|
14614
|
|
|
|
|
|
|
|
|
14615
|
8
|
|
|
|
|
|
bool unicode_tokenizer::emergency_sentence_split(const vector& tokens) { |
|
14616
|
|
|
|
|
|
|
using namespace unilib; |
|
14617
|
|
|
|
|
|
|
|
|
14618
|
|
|
|
|
|
|
// Implement emergency splitting for large sentences |
|
14619
|
8
|
50
|
|
|
|
|
return tokens.size() >= 500 || |
|
14620
|
16
|
50
|
|
|
|
|
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
14621
|
0
|
0
|
|
|
|
|
(tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po); |
|
14622
|
|
|
|
|
|
|
} |
|
14623
|
|
|
|
|
|
|
|
|
14624
|
0
|
|
|
|
|
|
bool unicode_tokenizer::is_eos(const vector& tokens, char32_t eos_chr, const unordered_set* abbreviations) { |
|
14625
|
|
|
|
|
|
|
using namespace unilib; |
|
14626
|
|
|
|
|
|
|
|
|
14627
|
0
|
0
|
|
|
|
|
if (eos_chr == '.' && !tokens.empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14628
|
|
|
|
|
|
|
// Ignore one-letter capitals before dot |
|
14629
|
0
|
0
|
|
|
|
|
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14630
|
|
|
|
|
|
|
return false; |
|
14631
|
|
|
|
|
|
|
|
|
14632
|
|
|
|
|
|
|
// Ignore specified abbreviations |
|
14633
|
0
|
0
|
|
|
|
|
if (abbreviations) { |
|
14634
|
|
|
|
|
|
|
eos_buffer.clear(); |
|
14635
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < tokens.back().length; i++) |
|
14636
|
0
|
|
|
|
|
|
utf8::append(eos_buffer, unicode::lowercase(chars[tokens.back().start + i].chr)); |
|
14637
|
0
|
0
|
|
|
|
|
if (abbreviations->count(eos_buffer)) |
|
14638
|
|
|
|
|
|
|
return false; |
|
14639
|
|
|
|
|
|
|
} |
|
14640
|
|
|
|
|
|
|
} |
|
14641
|
|
|
|
|
|
|
return true; |
|
14642
|
|
|
|
|
|
|
} |
|
14643
|
|
|
|
|
|
|
|
|
14644
|
|
|
|
|
|
|
} // namespace morphodita |
|
14645
|
|
|
|
|
|
|
|
|
14646
|
|
|
|
|
|
|
///////// |
|
14647
|
|
|
|
|
|
|
// File: morphodita/tokenizer/vertical_tokenizer.cpp |
|
14648
|
|
|
|
|
|
|
///////// |
|
14649
|
|
|
|
|
|
|
|
|
14650
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14651
|
|
|
|
|
|
|
// |
|
14652
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14653
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14654
|
|
|
|
|
|
|
// |
|
14655
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14656
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14657
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14658
|
|
|
|
|
|
|
|
|
14659
|
|
|
|
|
|
|
namespace morphodita { |
|
14660
|
|
|
|
|
|
|
|
|
14661
|
0
|
|
|
|
|
|
// Reads lines starting at `current` and appends one (start, length) entry
// per non-empty line to `tokens`; stops after consuming the first empty
// line or when the end of the text is reached.
//
// A line ends at '\r' or '\n'; a following complementary character
// ("\r\n" or "\n\r") is consumed as part of the same line break.
// Returns false only when `current` is already at the last entry of
// `chars` on entry; otherwise true.
//
// NOTE(review): the element type of the `vector` parameter is not visible
// in this listing (template argument appears to have been lost) — confirm
// against the class declaration.
bool vertical_tokenizer::next_sentence(vector& tokens) {
  const size_t text_end = chars.size() - 1;  // index of the last entry of chars
  if (current >= text_end) return false;

  for (;;) {
    const size_t line_start = current;
    while (current < text_end && chars[current].chr != '\r' && chars[current].chr != '\n')
      current++;
    const size_t line_end = current;

    // Skip the line terminator, treating CR LF and LF CR as one break.
    if (current < text_end) {
      current++;
      if (current < text_end) {
        const auto first = chars[current - 1].chr;
        const auto second = chars[current].chr;
        if ((first == '\r' && second == '\n') || (first == '\n' && second == '\r'))
          current++;
      }
    }

    if (line_start >= line_end) break;
    tokens.emplace_back(line_start, line_end - line_start);
  }

  return true;
}
|
14685
|
|
|
|
|
|
|
|
|
14686
|
|
|
|
|
|
|
} // namespace morphodita |
|
14687
|
|
|
|
|
|
|
|
|
14688
|
|
|
|
|
|
|
///////// |
|
14689
|
|
|
|
|
|
|
// File: unilib/version.h |
|
14690
|
|
|
|
|
|
|
///////// |
|
14691
|
|
|
|
|
|
|
|
|
14692
|
|
|
|
|
|
|
// This file is part of UniLib . |
|
14693
|
|
|
|
|
|
|
// |
|
14694
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
14695
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14696
|
|
|
|
|
|
|
// |
|
14697
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14698
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14699
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14700
|
|
|
|
|
|
|
// |
|
14701
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
14702
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
14703
|
|
|
|
|
|
|
|
|
14704
|
|
|
|
|
|
|
namespace unilib { |
|
14705
|
|
|
|
|
|
|
|
|
14706
|
0
|
|
|
|
|
|
// Version descriptor: three numeric components plus a prerelease label.
struct version {
  unsigned major, minor, patch;
  std::string prerelease;

  // Returns current version.
  static version current();
};
|
14715
|
|
|
|
|
|
|
|
|
14716
|
|
|
|
|
|
|
} // namespace unilib |
|
14717
|
|
|
|
|
|
|
|
|
14718
|
|
|
|
|
|
|
///////// |
|
14719
|
|
|
|
|
|
|
// File: morphodita/version/version.h |
|
14720
|
|
|
|
|
|
|
///////// |
|
14721
|
|
|
|
|
|
|
|
|
14722
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14723
|
|
|
|
|
|
|
// |
|
14724
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14725
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14726
|
|
|
|
|
|
|
// |
|
14727
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14728
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14729
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14730
|
|
|
|
|
|
|
|
|
14731
|
|
|
|
|
|
|
namespace morphodita { |
|
14732
|
|
|
|
|
|
|
|
|
14733
|
0
|
|
|
|
|
|
class version { |
|
14734
|
|
|
|
|
|
|
public: |
|
14735
|
|
|
|
|
|
|
unsigned major; |
|
14736
|
|
|
|
|
|
|
unsigned minor; |
|
14737
|
|
|
|
|
|
|
unsigned patch; |
|
14738
|
|
|
|
|
|
|
string prerelease; |
|
14739
|
|
|
|
|
|
|
|
|
14740
|
|
|
|
|
|
|
// Returns current MorphoDiTa version. |
|
14741
|
|
|
|
|
|
|
static version current(); |
|
14742
|
|
|
|
|
|
|
|
|
14743
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
|
14744
|
|
|
|
|
|
|
static string version_and_copyright(const string& other_libraries = string()); |
|
14745
|
|
|
|
|
|
|
}; |
|
14746
|
|
|
|
|
|
|
|
|
14747
|
|
|
|
|
|
|
} // namespace morphodita |
|
14748
|
|
|
|
|
|
|
|
|
14749
|
|
|
|
|
|
|
///////// |
|
14750
|
|
|
|
|
|
|
// File: morphodita/version/version.cpp |
|
14751
|
|
|
|
|
|
|
///////// |
|
14752
|
|
|
|
|
|
|
|
|
14753
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
14754
|
|
|
|
|
|
|
// |
|
14755
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14756
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14757
|
|
|
|
|
|
|
// |
|
14758
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14759
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14760
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14761
|
|
|
|
|
|
|
|
|
14762
|
|
|
|
|
|
|
namespace morphodita { |
|
14763
|
|
|
|
|
|
|
|
|
14764
|
0
|
|
|
|
|
|
version version::current() { |
|
14765
|
0
|
0
|
|
|
|
|
return {1, 11, 1, "dev"}; |
|
|
|
0
|
|
|
|
|
|
|
14766
|
|
|
|
|
|
|
} |
|
14767
|
|
|
|
|
|
|
|
|
14768
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
|
14769
|
0
|
|
|
|
|
|
string version::version_and_copyright(const string& other_libraries) { |
|
14770
|
0
|
|
|
|
|
|
ostringstream info; |
|
14771
|
|
|
|
|
|
|
|
|
14772
|
|
|
|
|
|
|
auto morphodita = version::current(); |
|
14773
|
|
|
|
|
|
|
auto unilib = unilib::version::current(); |
|
14774
|
|
|
|
|
|
|
|
|
14775
|
0
|
|
|
|
|
|
info << "MorphoDiTa version " << morphodita.major << '.' << morphodita.minor << '.' << morphodita.patch |
|
14776
|
0
|
0
|
|
|
|
|
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
|
|
0
|
|
|
|
|
|
|
14777
|
0
|
|
|
|
|
|
<< " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch |
|
14778
|
0
|
0
|
|
|
|
|
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
|
0
|
|
|
|
|
|
|
14779
|
|
|
|
|
|
|
"Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n" |
|
14780
|
0
|
0
|
|
|
|
|
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
|
14781
|
|
|
|
|
|
|
|
|
14782
|
0
|
|
|
|
|
|
return info.str(); |
|
14783
|
|
|
|
|
|
|
} |
|
14784
|
|
|
|
|
|
|
|
|
14785
|
|
|
|
|
|
|
} // namespace morphodita |
|
14786
|
|
|
|
|
|
|
|
|
14787
|
|
|
|
|
|
|
///////// |
|
14788
|
|
|
|
|
|
|
// File: parsito/configuration/configuration.cpp |
|
14789
|
|
|
|
|
|
|
///////// |
|
14790
|
|
|
|
|
|
|
|
|
14791
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
14792
|
|
|
|
|
|
|
// |
|
14793
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14794
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14795
|
|
|
|
|
|
|
// |
|
14796
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14797
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14798
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14799
|
|
|
|
|
|
|
|
|
14800
|
|
|
|
|
|
|
namespace parsito { |
|
14801
|
|
|
|
|
|
|
|
|
14802
|
1
|
|
|
|
|
|
void configuration::init(tree* t) { |
|
14803
|
1
|
50
|
|
|
|
|
assert(t); |
|
14804
|
|
|
|
|
|
|
|
|
14805
|
|
|
|
|
|
|
t->unlink_all_nodes(); |
|
14806
|
1
|
|
|
|
|
|
this->t = t; |
|
14807
|
|
|
|
|
|
|
|
|
14808
|
|
|
|
|
|
|
stack.clear(); |
|
14809
|
2
|
50
|
|
|
|
|
if (!t->nodes.empty()) stack.push_back(0); |
|
14810
|
|
|
|
|
|
|
|
|
14811
|
|
|
|
|
|
|
buffer.clear(); |
|
14812
|
1
|
|
|
|
|
|
buffer.reserve(t->nodes.size()); |
|
14813
|
8
|
100
|
|
|
|
|
for (size_t i = t->nodes.size(); i > 1; i--) |
|
14814
|
14
|
|
|
|
|
|
buffer.push_back(i - 1); |
|
14815
|
1
|
|
|
|
|
|
} |
|
14816
|
|
|
|
|
|
|
|
|
14817
|
0
|
|
|
|
|
|
bool configuration::final() { |
|
14818
|
67
|
0
|
|
|
|
|
return buffer.empty() && stack.size() <= 1; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14819
|
|
|
|
|
|
|
} |
|
14820
|
|
|
|
|
|
|
|
|
14821
|
|
|
|
|
|
|
} // namespace parsito |
|
14822
|
|
|
|
|
|
|
|
|
14823
|
|
|
|
|
|
|
///////// |
|
14824
|
|
|
|
|
|
|
// File: parsito/configuration/node_extractor.h |
|
14825
|
|
|
|
|
|
|
///////// |
|
14826
|
|
|
|
|
|
|
|
|
14827
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
14828
|
|
|
|
|
|
|
// |
|
14829
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14830
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14831
|
|
|
|
|
|
|
// |
|
14832
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14833
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14834
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14835
|
|
|
|
|
|
|
|
|
14836
|
|
|
|
|
|
|
namespace parsito { |
|
14837
|
|
|
|
|
|
|
|
|
14838
|
1
|
|
|
|
|
|
class node_extractor { |
|
14839
|
|
|
|
|
|
|
public: |
|
14840
|
|
|
|
|
|
|
unsigned node_count() const; |
|
14841
|
|
|
|
|
|
|
void extract(const configuration& conf, vector& nodes) const; |
|
14842
|
|
|
|
|
|
|
|
|
14843
|
|
|
|
|
|
|
bool create(string_piece description, string& error); |
|
14844
|
|
|
|
|
|
|
|
|
14845
|
|
|
|
|
|
|
private: |
|
14846
|
|
|
|
|
|
|
enum start_t { STACK = 0, BUFFER = 1 }; |
|
14847
|
|
|
|
|
|
|
enum direction_t { PARENT = 0, CHILD = 1 }; |
|
14848
|
80
|
|
|
|
|
|
struct node_selector { |
|
14849
|
|
|
|
|
|
|
pair start; |
|
14850
|
|
|
|
|
|
|
vector> directions; |
|
14851
|
|
|
|
|
|
|
|
|
14852
|
|
|
|
|
|
|
node_selector(start_t start, int start_index) : start(start, start_index) {} |
|
14853
|
|
|
|
|
|
|
}; |
|
14854
|
|
|
|
|
|
|
|
|
14855
|
|
|
|
|
|
|
vector selectors; |
|
14856
|
|
|
|
|
|
|
}; |
|
14857
|
|
|
|
|
|
|
|
|
14858
|
|
|
|
|
|
|
} // namespace parsito |
|
14859
|
|
|
|
|
|
|
|
|
14860
|
|
|
|
|
|
|
///////// |
|
14861
|
|
|
|
|
|
|
// File: parsito/configuration/node_extractor.cpp |
|
14862
|
|
|
|
|
|
|
///////// |
|
14863
|
|
|
|
|
|
|
|
|
14864
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
14865
|
|
|
|
|
|
|
// |
|
14866
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14867
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14868
|
|
|
|
|
|
|
// |
|
14869
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14870
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14871
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14872
|
|
|
|
|
|
|
|
|
14873
|
|
|
|
|
|
|
namespace parsito { |
|
14874
|
|
|
|
|
|
|
|
|
14875
|
0
|
|
|
|
|
|
unsigned node_extractor::node_count() const { |
|
14876
|
0
|
|
|
|
|
|
return selectors.size(); |
|
14877
|
|
|
|
|
|
|
} |
|
14878
|
|
|
|
|
|
|
|
|
14879
|
62
|
|
|
|
|
|
void node_extractor::extract(const configuration& conf, vector& nodes) const { |
|
14880
|
|
|
|
|
|
|
nodes.clear(); |
|
14881
|
1178
|
100
|
|
|
|
|
for (auto&& selector : selectors) { |
|
14882
|
|
|
|
|
|
|
// Start by locating starting node |
|
14883
|
1116
|
|
|
|
|
|
int current = -1; |
|
14884
|
1116
|
|
|
|
|
|
switch (selector.start.first) { |
|
14885
|
|
|
|
|
|
|
case STACK: |
|
14886
|
930
|
100
|
|
|
|
|
if (selector.start.second < int(conf.stack.size())) |
|
14887
|
867
|
|
|
|
|
|
current = conf.stack[conf.stack.size() - 1 - selector.start.second]; |
|
14888
|
|
|
|
|
|
|
break; |
|
14889
|
|
|
|
|
|
|
case BUFFER: |
|
14890
|
186
|
100
|
|
|
|
|
if (selector.start.second < int(conf.buffer.size())) |
|
14891
|
98
|
|
|
|
|
|
current = conf.buffer[conf.buffer.size() - 1 - selector.start.second]; |
|
14892
|
|
|
|
|
|
|
break; |
|
14893
|
|
|
|
|
|
|
} |
|
14894
|
|
|
|
|
|
|
|
|
14895
|
|
|
|
|
|
|
// Follow directions to the final node |
|
14896
|
1116
|
100
|
|
|
|
|
if (current >= 0) |
|
14897
|
1212
|
100
|
|
|
|
|
for (auto&& direction : selector.directions) { |
|
14898
|
802
|
|
|
|
|
|
const node& node = conf.t->nodes[current]; |
|
14899
|
802
|
|
|
|
|
|
switch (direction.first) { |
|
14900
|
|
|
|
|
|
|
case PARENT: |
|
14901
|
0
|
0
|
|
|
|
|
current = node.head ? node.head : -1; |
|
14902
|
0
|
|
|
|
|
|
break; |
|
14903
|
|
|
|
|
|
|
case CHILD: |
|
14904
|
401
|
100
|
|
|
|
|
current = direction.second >= 0 && direction.second < int(node.children.size()) ? |
|
14905
|
120
|
|
|
|
|
|
node.children[direction.second] : |
|
14906
|
401
|
100
|
|
|
|
|
direction.second < 0 && -direction.second <= int(node.children.size()) ? |
|
14907
|
127
|
|
|
|
|
|
node.children[node.children.size() + direction.second] : |
|
14908
|
1330
|
100
|
|
|
|
|
-1; |
|
|
|
100
|
|
|
|
|
|
|
14909
|
802
|
|
|
|
|
|
break; |
|
14910
|
|
|
|
|
|
|
} |
|
14911
|
802
|
100
|
|
|
|
|
if (current <= 0) break; |
|
14912
|
|
|
|
|
|
|
} |
|
14913
|
|
|
|
|
|
|
|
|
14914
|
|
|
|
|
|
|
// Add the selected node |
|
14915
|
1116
|
|
|
|
|
|
nodes.push_back(current); |
|
14916
|
|
|
|
|
|
|
} |
|
14917
|
62
|
|
|
|
|
|
} |
|
14918
|
|
|
|
|
|
|
|
|
14919
|
1
|
|
|
|
|
|
bool node_extractor::create(string_piece description, string& error) { |
|
14920
|
1
|
|
|
|
|
|
selectors.clear(); |
|
14921
|
|
|
|
|
|
|
error.clear(); |
|
14922
|
|
|
|
|
|
|
|
|
14923
|
|
|
|
|
|
|
vector lines, parts, words; |
|
14924
|
1
|
50
|
|
|
|
|
split(description, '\n', lines); |
|
14925
|
20
|
100
|
|
|
|
|
for (auto&& line : lines) { |
|
14926
|
19
|
100
|
|
|
|
|
if (!line.len || line.str[0] == '#') continue; |
|
|
|
50
|
|
|
|
|
|
|
14927
|
|
|
|
|
|
|
|
|
14928
|
|
|
|
|
|
|
// Separate start and directions |
|
14929
|
18
|
50
|
|
|
|
|
split(line, ',', parts); |
|
14930
|
|
|
|
|
|
|
|
|
14931
|
|
|
|
|
|
|
// Parse start |
|
14932
|
18
|
50
|
|
|
|
|
split(parts[0], ' ', words); |
|
14933
|
18
|
50
|
|
|
|
|
if (words.size() != 2) |
|
14934
|
0
|
0
|
|
|
|
|
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14935
|
|
|
|
|
|
|
|
|
14936
|
|
|
|
|
|
|
start_t start; |
|
14937
|
18
|
100
|
|
|
|
|
if (words[0] == "stack") |
|
14938
|
15
|
|
|
|
|
|
start = STACK; |
|
14939
|
3
|
50
|
|
|
|
|
else if (words[0] == "buffer") |
|
14940
|
3
|
|
|
|
|
|
start = BUFFER; |
|
14941
|
|
|
|
|
|
|
else |
|
14942
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14943
|
|
|
|
|
|
|
|
|
14944
|
|
|
|
|
|
|
int start_index; |
|
14945
|
18
|
50
|
|
|
|
|
if (!parse_int(words[1], "starting index", start_index, error)) return false; |
|
|
|
50
|
|
|
|
|
|
|
14946
|
|
|
|
|
|
|
|
|
14947
|
18
|
50
|
|
|
|
|
selectors.emplace_back(start, start_index); |
|
14948
|
|
|
|
|
|
|
|
|
14949
|
|
|
|
|
|
|
// Parse directions |
|
14950
|
34
|
100
|
|
|
|
|
for (size_t i = 1; i < parts.size(); i++) { |
|
14951
|
16
|
50
|
|
|
|
|
split(parts[i], ' ', words); |
|
14952
|
16
|
50
|
|
|
|
|
if (words.empty()) |
|
14953
|
0
|
0
|
|
|
|
|
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14954
|
|
|
|
|
|
|
|
|
14955
|
16
|
50
|
|
|
|
|
if (words[0] == "parent") { |
|
14956
|
0
|
0
|
|
|
|
|
if (words.size() != 1) |
|
14957
|
0
|
0
|
|
|
|
|
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14958
|
0
|
0
|
|
|
|
|
selectors.back().directions.emplace_back(PARENT, 0); |
|
14959
|
16
|
50
|
|
|
|
|
} else if (words[0] == "child") { |
|
14960
|
16
|
50
|
|
|
|
|
if (words.size() != 2) |
|
14961
|
0
|
0
|
|
|
|
|
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14962
|
|
|
|
|
|
|
int child_index; |
|
14963
|
16
|
50
|
|
|
|
|
if (!parse_int(words[1], "child index", child_index, error)) return false; |
|
|
|
50
|
|
|
|
|
|
|
14964
|
16
|
50
|
|
|
|
|
selectors.back().directions.emplace_back(CHILD, child_index); |
|
14965
|
|
|
|
|
|
|
} else { |
|
14966
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
14967
|
|
|
|
|
|
|
} |
|
14968
|
|
|
|
|
|
|
} |
|
14969
|
|
|
|
|
|
|
} |
|
14970
|
|
|
|
|
|
|
|
|
14971
|
|
|
|
|
|
|
return true; |
|
14972
|
|
|
|
|
|
|
} |
|
14973
|
|
|
|
|
|
|
|
|
14974
|
|
|
|
|
|
|
} // namespace parsito |
|
14975
|
|
|
|
|
|
|
|
|
14976
|
|
|
|
|
|
|
///////// |
|
14977
|
|
|
|
|
|
|
// File: parsito/configuration/value_extractor.h |
|
14978
|
|
|
|
|
|
|
///////// |
|
14979
|
|
|
|
|
|
|
|
|
14980
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
14981
|
|
|
|
|
|
|
// |
|
14982
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
14983
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
14984
|
|
|
|
|
|
|
// |
|
14985
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
14986
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
14987
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
14988
|
|
|
|
|
|
|
|
|
14989
|
|
|
|
|
|
|
namespace parsito { |
|
14990
|
|
|
|
|
|
|
|
|
14991
|
|
|
|
|
|
|
class value_extractor { |
|
14992
|
|
|
|
|
|
|
public: |
|
14993
|
|
|
|
|
|
|
void extract(const node& n, string& value) const; |
|
14994
|
|
|
|
|
|
|
|
|
14995
|
|
|
|
|
|
|
bool create(string_piece description, string& error); |
|
14996
|
|
|
|
|
|
|
|
|
14997
|
|
|
|
|
|
|
private: |
|
14998
|
|
|
|
|
|
|
enum value_t { FORM = 0, LEMMA = 1, LEMMA_ID = 2, TAG = 3, UNIVERSAL_TAG = 4, |
|
14999
|
|
|
|
|
|
|
FEATS = 5, UNIVERSAL_TAG_FEATS = 6, DEPREL = 7 }; |
|
15000
|
|
|
|
|
|
|
value_t selector; |
|
15001
|
|
|
|
|
|
|
}; |
|
15002
|
|
|
|
|
|
|
|
|
15003
|
|
|
|
|
|
|
} // namespace parsito |
|
15004
|
|
|
|
|
|
|
|
|
15005
|
|
|
|
|
|
|
///////// |
|
15006
|
|
|
|
|
|
|
// File: parsito/configuration/value_extractor.cpp |
|
15007
|
|
|
|
|
|
|
///////// |
|
15008
|
|
|
|
|
|
|
|
|
15009
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15010
|
|
|
|
|
|
|
// |
|
15011
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15012
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15013
|
|
|
|
|
|
|
// |
|
15014
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15015
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15016
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15017
|
|
|
|
|
|
|
|
|
15018
|
|
|
|
|
|
|
namespace parsito { |
|
15019
|
|
|
|
|
|
|
|
|
15020
|
2016
|
|
|
|
|
|
void value_extractor::extract(const node& n, string& value) const { |
|
15021
|
2016
|
|
|
|
|
|
switch (selector) { |
|
15022
|
|
|
|
|
|
|
case FORM: |
|
15023
|
504
|
|
|
|
|
|
value.assign(n.form); |
|
15024
|
|
|
|
|
|
|
break; |
|
15025
|
|
|
|
|
|
|
case LEMMA: |
|
15026
|
0
|
|
|
|
|
|
value.assign(n.lemma); |
|
15027
|
|
|
|
|
|
|
break; |
|
15028
|
|
|
|
|
|
|
case LEMMA_ID: |
|
15029
|
0
|
0
|
|
|
|
|
if (!n.misc.empty()) { |
|
15030
|
|
|
|
|
|
|
// Try finding LId= in misc column |
|
15031
|
0
|
|
|
|
|
|
auto lid = n.misc.find("LId="); |
|
15032
|
0
|
0
|
|
|
|
|
if (lid != string::npos) { |
|
15033
|
0
|
|
|
|
|
|
lid += 4; |
|
15034
|
|
|
|
|
|
|
|
|
15035
|
|
|
|
|
|
|
// Find optional | ending the lemma_id |
|
15036
|
0
|
|
|
|
|
|
auto lid_end = n.misc.find('|', lid); |
|
15037
|
0
|
0
|
|
|
|
|
if (lid_end == string::npos) lid_end = n.misc.size(); |
|
15038
|
|
|
|
|
|
|
|
|
15039
|
|
|
|
|
|
|
// Store the lemma_id |
|
15040
|
0
|
|
|
|
|
|
value.assign(n.misc, lid, lid_end - lid); |
|
15041
|
0
|
|
|
|
|
|
break; |
|
15042
|
|
|
|
|
|
|
} |
|
15043
|
|
|
|
|
|
|
} |
|
15044
|
0
|
|
|
|
|
|
value.assign(n.lemma); |
|
15045
|
|
|
|
|
|
|
break; |
|
15046
|
|
|
|
|
|
|
case TAG: |
|
15047
|
0
|
|
|
|
|
|
value.assign(n.xpostag); |
|
15048
|
|
|
|
|
|
|
break; |
|
15049
|
|
|
|
|
|
|
case UNIVERSAL_TAG: |
|
15050
|
504
|
|
|
|
|
|
value.assign(n.upostag); |
|
15051
|
|
|
|
|
|
|
break; |
|
15052
|
|
|
|
|
|
|
case FEATS: |
|
15053
|
504
|
|
|
|
|
|
value.assign(n.feats); |
|
15054
|
|
|
|
|
|
|
break; |
|
15055
|
|
|
|
|
|
|
case UNIVERSAL_TAG_FEATS: |
|
15056
|
0
|
|
|
|
|
|
value.assign(n.upostag).append(n.feats); |
|
15057
|
|
|
|
|
|
|
break; |
|
15058
|
|
|
|
|
|
|
case DEPREL: |
|
15059
|
504
|
|
|
|
|
|
value.assign(n.deprel); |
|
15060
|
|
|
|
|
|
|
break; |
|
15061
|
|
|
|
|
|
|
} |
|
15062
|
2016
|
|
|
|
|
|
} |
|
15063
|
|
|
|
|
|
|
|
|
15064
|
4
|
|
|
|
|
|
bool value_extractor::create(string_piece description, string& error) { |
|
15065
|
|
|
|
|
|
|
error.clear(); |
|
15066
|
|
|
|
|
|
|
|
|
15067
|
4
|
100
|
|
|
|
|
if (description == "form") |
|
15068
|
1
|
|
|
|
|
|
selector = FORM; |
|
15069
|
3
|
50
|
|
|
|
|
else if (description == "lemma") |
|
15070
|
0
|
|
|
|
|
|
selector = LEMMA; |
|
15071
|
3
|
50
|
|
|
|
|
else if (description == "lemma_id") |
|
15072
|
0
|
|
|
|
|
|
selector = LEMMA_ID; |
|
15073
|
3
|
50
|
|
|
|
|
else if (description == "tag") |
|
15074
|
0
|
|
|
|
|
|
selector = TAG; |
|
15075
|
3
|
100
|
|
|
|
|
else if (description == "universal_tag") |
|
15076
|
1
|
|
|
|
|
|
selector = UNIVERSAL_TAG; |
|
15077
|
2
|
100
|
|
|
|
|
else if (description == "feats") |
|
15078
|
1
|
|
|
|
|
|
selector = FEATS; |
|
15079
|
1
|
50
|
|
|
|
|
else if (description == "universal_tag_feats") |
|
15080
|
0
|
|
|
|
|
|
selector = UNIVERSAL_TAG_FEATS; |
|
15081
|
1
|
50
|
|
|
|
|
else if (description == "deprel") |
|
15082
|
1
|
|
|
|
|
|
selector = DEPREL; |
|
15083
|
|
|
|
|
|
|
else |
|
15084
|
0
|
|
|
|
|
|
return error.assign("Cannot parse value selector '").append(description.str, description.len).append("'!"), false; |
|
15085
|
|
|
|
|
|
|
|
|
15086
|
|
|
|
|
|
|
return true; |
|
15087
|
|
|
|
|
|
|
} |
|
15088
|
|
|
|
|
|
|
|
|
15089
|
|
|
|
|
|
|
} // namespace parsito |
|
15090
|
|
|
|
|
|
|
|
|
15091
|
|
|
|
|
|
|
///////// |
|
15092
|
|
|
|
|
|
|
// File: parsito/embedding/embedding.h |
|
15093
|
|
|
|
|
|
|
///////// |
|
15094
|
|
|
|
|
|
|
|
|
15095
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15096
|
|
|
|
|
|
|
// |
|
15097
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15098
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15099
|
|
|
|
|
|
|
// |
|
15100
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15101
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15102
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15103
|
|
|
|
|
|
|
|
|
15104
|
|
|
|
|
|
|
namespace parsito { |
|
15105
|
|
|
|
|
|
|
|
|
15106
|
4
|
|
|
|
|
|
class embedding { |
|
15107
|
|
|
|
|
|
|
public: |
|
15108
|
|
|
|
|
|
|
unsigned dimension; |
|
15109
|
|
|
|
|
|
|
|
|
15110
|
|
|
|
|
|
|
int lookup_word(const string& word, string& buffer) const; |
|
15111
|
|
|
|
|
|
|
int unknown_word() const; |
|
15112
|
|
|
|
|
|
|
float* weight(int id); // nullptr for wrong id |
|
15113
|
|
|
|
|
|
|
const float* weight(int id) const; // nullpt for wrong id |
|
15114
|
|
|
|
|
|
|
|
|
15115
|
|
|
|
|
|
|
bool can_update_weights(int id) const; |
|
15116
|
|
|
|
|
|
|
|
|
15117
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
15118
|
|
|
|
|
|
|
void save(binary_encoder& enc) const; |
|
15119
|
|
|
|
|
|
|
|
|
15120
|
|
|
|
|
|
|
void create(unsigned dimension, int updatable_index, const vector>>& words, const vector& unknown_weights); |
|
15121
|
|
|
|
|
|
|
void export_embeddings(vector>>& words, vector& unknown_weights) const; |
|
15122
|
|
|
|
|
|
|
private: |
|
15123
|
|
|
|
|
|
|
int updatable_index, unknown_index; |
|
15124
|
|
|
|
|
|
|
|
|
15125
|
|
|
|
|
|
|
unordered_map dictionary; |
|
15126
|
|
|
|
|
|
|
vector weights; |
|
15127
|
|
|
|
|
|
|
}; |
|
15128
|
|
|
|
|
|
|
|
|
15129
|
|
|
|
|
|
|
} // namespace parsito |
|
15130
|
|
|
|
|
|
|
|
|
15131
|
|
|
|
|
|
|
///////// |
|
15132
|
|
|
|
|
|
|
// File: parsito/embedding/embedding.cpp |
|
15133
|
|
|
|
|
|
|
///////// |
|
15134
|
|
|
|
|
|
|
|
|
15135
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15136
|
|
|
|
|
|
|
// |
|
15137
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15138
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15139
|
|
|
|
|
|
|
// |
|
15140
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15141
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15142
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15143
|
|
|
|
|
|
|
|
|
15144
|
|
|
|
|
|
|
namespace parsito { |
|
15145
|
|
|
|
|
|
|
|
|
15146
|
128
|
|
|
|
|
|
int embedding::lookup_word(const string& word, string& buffer) const { |
|
15147
|
|
|
|
|
|
|
using namespace unilib; |
|
15148
|
|
|
|
|
|
|
|
|
15149
|
|
|
|
|
|
|
auto it = dictionary.find(word); |
|
15150
|
128
|
100
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
|
15151
|
|
|
|
|
|
|
|
|
15152
|
|
|
|
|
|
|
// We now apply several heuristics to find a match |
|
15153
|
|
|
|
|
|
|
|
|
15154
|
|
|
|
|
|
|
// Try locating uppercase/titlecase characters which we could lowercase |
|
15155
|
|
|
|
|
|
|
bool first = true; |
|
15156
|
|
|
|
|
|
|
unicode::category_t first_category = 0, other_categories = 0; |
|
15157
|
54
|
100
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) { |
|
15158
|
18
|
100
|
|
|
|
|
(first ? first_category : other_categories) |= unicode::category(chr); |
|
15159
|
|
|
|
|
|
|
first = false; |
|
15160
|
|
|
|
|
|
|
} |
|
15161
|
|
|
|
|
|
|
|
|
15162
|
36
|
50
|
|
|
|
|
if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) { |
|
|
|
0
|
|
|
|
|
|
|
15163
|
|
|
|
|
|
|
// Lowercase all characters but the first |
|
15164
|
|
|
|
|
|
|
buffer.clear(); |
|
15165
|
|
|
|
|
|
|
first = true; |
|
15166
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) { |
|
15167
|
0
|
0
|
|
|
|
|
utf8::append(buffer, first ? chr : unicode::lowercase(chr)); |
|
15168
|
|
|
|
|
|
|
first = false; |
|
15169
|
|
|
|
|
|
|
} |
|
15170
|
|
|
|
|
|
|
|
|
15171
|
|
|
|
|
|
|
it = dictionary.find(buffer); |
|
15172
|
0
|
0
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
|
15173
|
|
|
|
|
|
|
} |
|
15174
|
|
|
|
|
|
|
|
|
15175
|
36
|
50
|
|
|
|
|
if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) { |
|
|
|
50
|
|
|
|
|
|
|
15176
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, word, buffer); |
|
15177
|
|
|
|
|
|
|
|
|
15178
|
|
|
|
|
|
|
it = dictionary.find(buffer); |
|
15179
|
0
|
0
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
|
15180
|
|
|
|
|
|
|
} |
|
15181
|
|
|
|
|
|
|
|
|
15182
|
|
|
|
|
|
|
// If the word starts with digit and contain only digits and non-letter characters |
|
15183
|
|
|
|
|
|
|
// i.e. large number, date, time, try replacing it with first digit only. |
|
15184
|
36
|
50
|
|
|
|
|
if ((first_category & unicode::N) && !(other_categories & unicode::L)) { |
|
|
|
0
|
|
|
|
|
|
|
15185
|
|
|
|
|
|
|
buffer.clear(); |
|
15186
|
0
|
|
|
|
|
|
utf8::append(buffer, utf8::first(word)); |
|
15187
|
|
|
|
|
|
|
|
|
15188
|
|
|
|
|
|
|
it = dictionary.find(buffer); |
|
15189
|
0
|
0
|
|
|
|
|
if (it != dictionary.end()) return it->second; |
|
15190
|
|
|
|
|
|
|
} |
|
15191
|
|
|
|
|
|
|
|
|
15192
|
36
|
|
|
|
|
|
return unknown_index; |
|
15193
|
|
|
|
|
|
|
} |
|
15194
|
|
|
|
|
|
|
|
|
15195
|
0
|
|
|
|
|
|
int embedding::unknown_word() const { |
|
15196
|
0
|
|
|
|
|
|
return unknown_index; |
|
15197
|
|
|
|
|
|
|
} |
|
15198
|
|
|
|
|
|
|
|
|
15199
|
0
|
|
|
|
|
|
float* embedding::weight(int id) { |
|
15200
|
0
|
0
|
|
|
|
|
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15201
|
0
|
|
|
|
|
|
return weights.data() + id * dimension; |
|
15202
|
|
|
|
|
|
|
} |
|
15203
|
|
|
|
|
|
|
|
|
15204
|
0
|
|
|
|
|
|
const float* embedding::weight(int id) const { |
|
15205
|
58
|
0
|
|
|
|
|
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15206
|
54
|
|
|
|
|
|
return weights.data() + id * dimension; |
|
15207
|
|
|
|
|
|
|
} |
|
15208
|
|
|
|
|
|
|
|
|
15209
|
4
|
|
|
|
|
|
void embedding::load(binary_decoder& data) { |
|
15210
|
|
|
|
|
|
|
// Load dimemsion |
|
15211
|
4
|
|
|
|
|
|
dimension = data.next_4B(); |
|
15212
|
|
|
|
|
|
|
|
|
15213
|
4
|
|
|
|
|
|
updatable_index = numeric_limits::max(); |
|
15214
|
|
|
|
|
|
|
|
|
15215
|
|
|
|
|
|
|
// Load dictionary |
|
15216
|
|
|
|
|
|
|
dictionary.clear(); |
|
15217
|
|
|
|
|
|
|
string word; |
|
15218
|
27
|
50
|
|
|
|
|
for (unsigned size = data.next_4B(); size; size--) { |
|
|
|
100
|
|
|
|
|
|
|
15219
|
23
|
50
|
|
|
|
|
data.next_str(word); |
|
15220
|
46
|
|
|
|
|
|
dictionary.emplace(word, (int)dictionary.size()); |
|
15221
|
|
|
|
|
|
|
} |
|
15222
|
|
|
|
|
|
|
|
|
15223
|
4
|
50
|
|
|
|
|
unknown_index = data.next_1B() ? dictionary.size() : -1; |
|
|
|
50
|
|
|
|
|
|
|
15224
|
|
|
|
|
|
|
|
|
15225
|
|
|
|
|
|
|
// Load weights |
|
15226
|
4
|
50
|
|
|
|
|
weights.resize(dimension * (dictionary.size() + (unknown_index >= 0))); |
|
15227
|
4
|
50
|
|
|
|
|
memcpy(weights.data(), data.next(weights.size()), sizeof(float) * weights.size()); |
|
15228
|
4
|
|
|
|
|
|
} |
|
15229
|
|
|
|
|
|
|
|
|
15230
|
|
|
|
|
|
|
} // namespace parsito |
|
15231
|
|
|
|
|
|
|
|
|
15232
|
|
|
|
|
|
|
///////// |
|
15233
|
|
|
|
|
|
|
// File: parsito/embedding/embedding_encode.cpp |
|
15234
|
|
|
|
|
|
|
///////// |
|
15235
|
|
|
|
|
|
|
|
|
15236
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15237
|
|
|
|
|
|
|
// |
|
15238
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15239
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15240
|
|
|
|
|
|
|
// |
|
15241
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15242
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15243
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15244
|
|
|
|
|
|
|
|
|
15245
|
|
|
|
|
|
|
namespace parsito { |
|
15246
|
|
|
|
|
|
|
|
|
15247
|
0
|
|
|
|
|
|
void embedding::save(binary_encoder& enc) const { |
|
15248
|
|
|
|
|
|
|
// Save dimension and update_weight |
|
15249
|
0
|
|
|
|
|
|
enc.add_4B(dimension); |
|
15250
|
|
|
|
|
|
|
|
|
15251
|
|
|
|
|
|
|
// Save the dictionary |
|
15252
|
0
|
|
|
|
|
|
vector words(dictionary.size()); |
|
15253
|
0
|
0
|
|
|
|
|
for (auto&& entry : dictionary) { |
|
15254
|
0
|
0
|
|
|
|
|
assert(entry.second >= 0 && entry.second < int(dictionary.size())); |
|
|
|
0
|
|
|
|
|
|
|
15255
|
0
|
|
|
|
|
|
words[entry.second] = entry.first; |
|
15256
|
|
|
|
|
|
|
} |
|
15257
|
0
|
|
|
|
|
|
enc.add_4B(dictionary.size()); |
|
15258
|
0
|
0
|
|
|
|
|
for (auto&& word : words) |
|
15259
|
0
|
0
|
|
|
|
|
enc.add_str(word); |
|
15260
|
|
|
|
|
|
|
|
|
15261
|
0
|
0
|
|
|
|
|
enc.add_1B(unknown_index >= 0); |
|
15262
|
|
|
|
|
|
|
|
|
15263
|
|
|
|
|
|
|
// Save the weights |
|
15264
|
|
|
|
|
|
|
enc.add_data(weights); |
|
15265
|
0
|
|
|
|
|
|
} |
|
15266
|
|
|
|
|
|
|
|
|
15267
|
0
|
|
|
|
|
|
bool embedding::can_update_weights(int id) const { |
|
15268
|
0
|
|
|
|
|
|
return id >= int(updatable_index); |
|
15269
|
|
|
|
|
|
|
} |
|
15270
|
|
|
|
|
|
|
|
|
15271
|
0
|
|
|
|
|
|
void embedding::create(unsigned dimension, int updatable_index, const vector>>& words, const vector& unknown_weights) { |
|
15272
|
0
|
|
|
|
|
|
this->dimension = dimension; |
|
15273
|
0
|
|
|
|
|
|
this->updatable_index = updatable_index; |
|
15274
|
|
|
|
|
|
|
|
|
15275
|
|
|
|
|
|
|
dictionary.clear(); |
|
15276
|
|
|
|
|
|
|
weights.clear(); |
|
15277
|
0
|
0
|
|
|
|
|
for (auto&& word : words) { |
|
15278
|
0
|
0
|
|
|
|
|
assert(word.second.size() == dimension); |
|
15279
|
0
|
|
|
|
|
|
dictionary.emplace(word.first, (int)dictionary.size()); |
|
15280
|
0
|
|
|
|
|
|
weights.insert(weights.end(), word.second.begin(), word.second.end()); |
|
15281
|
|
|
|
|
|
|
} |
|
15282
|
|
|
|
|
|
|
|
|
15283
|
0
|
0
|
|
|
|
|
if (unknown_weights.empty()) { |
|
15284
|
0
|
|
|
|
|
|
this->unknown_index = -1; |
|
15285
|
|
|
|
|
|
|
} else { |
|
15286
|
0
|
|
|
|
|
|
this->unknown_index = dictionary.size(); |
|
15287
|
0
|
|
|
|
|
|
weights.insert(weights.end(), unknown_weights.begin(), unknown_weights.end()); |
|
15288
|
|
|
|
|
|
|
} |
|
15289
|
0
|
|
|
|
|
|
} |
|
15290
|
|
|
|
|
|
|
|
|
15291
|
0
|
|
|
|
|
|
void embedding::export_embeddings(vector>>& words, vector& unknown_weights) const { |
|
15292
|
|
|
|
|
|
|
words.clear(); |
|
15293
|
|
|
|
|
|
|
unknown_weights.clear(); |
|
15294
|
|
|
|
|
|
|
|
|
15295
|
0
|
0
|
|
|
|
|
if (dictionary.empty()) return; |
|
15296
|
|
|
|
|
|
|
|
|
15297
|
0
|
0
|
|
|
|
|
assert(unknown_index < 0 || unknown_index == int(dictionary.size())); |
|
|
|
0
|
|
|
|
|
|
|
15298
|
|
|
|
|
|
|
|
|
15299
|
0
|
|
|
|
|
|
words.resize(dictionary.size()); |
|
15300
|
0
|
0
|
|
|
|
|
for (auto&& entry : dictionary) { |
|
15301
|
0
|
|
|
|
|
|
words[entry.second].first = entry.first; |
|
15302
|
0
|
|
|
|
|
|
words[entry.second].second.assign(weights.data() + entry.second * dimension, weights.data() + entry.second * dimension + dimension); |
|
15303
|
|
|
|
|
|
|
} |
|
15304
|
0
|
0
|
|
|
|
|
if (unknown_index >= 0) |
|
15305
|
0
|
|
|
|
|
|
unknown_weights.assign(weights.data() + unknown_index * dimension, weights.data() + unknown_index * dimension + dimension); |
|
15306
|
|
|
|
|
|
|
} |
|
15307
|
|
|
|
|
|
|
|
|
15308
|
|
|
|
|
|
|
} // namespace parsito |
|
15309
|
|
|
|
|
|
|
|
|
15310
|
|
|
|
|
|
|
///////// |
|
15311
|
|
|
|
|
|
|
// File: parsito/network/activation_function.h |
|
15312
|
|
|
|
|
|
|
///////// |
|
15313
|
|
|
|
|
|
|
|
|
15314
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15315
|
|
|
|
|
|
|
// |
|
15316
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15317
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15318
|
|
|
|
|
|
|
// |
|
15319
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15320
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15321
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15322
|
|
|
|
|
|
|
|
|
15323
|
|
|
|
|
|
|
namespace parsito { |
|
15324
|
|
|
|
|
|
|
|
|
15325
|
|
|
|
|
|
|
struct activation_function { |
|
15326
|
|
|
|
|
|
|
enum type { TANH = 0, CUBIC = 1, RELU = 2 }; |
|
15327
|
|
|
|
|
|
|
|
|
15328
|
|
|
|
|
|
|
static bool create(string_piece name, type& activation) { |
|
15329
|
|
|
|
|
|
|
if (name == "tanh") return activation = TANH, true; |
|
15330
|
|
|
|
|
|
|
if (name == "cubic") return activation = CUBIC, true; |
|
15331
|
|
|
|
|
|
|
if (name == "relu") return activation = RELU, true; |
|
15332
|
|
|
|
|
|
|
return false; |
|
15333
|
|
|
|
|
|
|
} |
|
15334
|
|
|
|
|
|
|
}; |
|
15335
|
|
|
|
|
|
|
|
|
15336
|
|
|
|
|
|
|
} // namespace parsito |
|
15337
|
|
|
|
|
|
|
|
|
15338
|
|
|
|
|
|
|
///////// |
|
15339
|
|
|
|
|
|
|
// File: parsito/network/neural_network.h |
|
15340
|
|
|
|
|
|
|
///////// |
|
15341
|
|
|
|
|
|
|
|
|
15342
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15343
|
|
|
|
|
|
|
// |
|
15344
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15345
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15346
|
|
|
|
|
|
|
// |
|
15347
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15348
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15349
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15350
|
|
|
|
|
|
|
|
|
15351
|
|
|
|
|
|
|
namespace parsito { |
|
15352
|
|
|
|
|
|
|
|
|
15353
|
7
|
0
|
|
|
|
|
class neural_network { |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
15354
|
|
|
|
|
|
|
public: |
|
15355
|
|
|
|
|
|
|
typedef vector>> embeddings_cache; |
|
15356
|
|
|
|
|
|
|
|
|
15357
|
|
|
|
|
|
|
void propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, |
|
15358
|
|
|
|
|
|
|
vector& hidden_layer, vector& outcomes, const embeddings_cache* cache = nullptr, bool softmax = true) const; |
|
15359
|
|
|
|
|
|
|
|
|
15360
|
|
|
|
|
|
|
void load(binary_decoder& data); |
|
15361
|
|
|
|
|
|
|
void generate_tanh_cache(); |
|
15362
|
|
|
|
|
|
|
void generate_embeddings_cache(const vector& embeddings, embeddings_cache& cache, unsigned max_words) const; |
|
15363
|
|
|
|
|
|
|
|
|
15364
|
|
|
|
|
|
|
private: |
|
15365
|
|
|
|
|
|
|
friend class neural_network_trainer; |
|
15366
|
|
|
|
|
|
|
|
|
15367
|
|
|
|
|
|
|
void load_matrix(binary_decoder& data, vector>& m); |
|
15368
|
|
|
|
|
|
|
|
|
15369
|
|
|
|
|
|
|
activation_function::type hidden_layer_activation; |
|
15370
|
|
|
|
|
|
|
vector> weights[2]; |
|
15371
|
|
|
|
|
|
|
|
|
15372
|
|
|
|
|
|
|
vector tanh_cache; |
|
15373
|
|
|
|
|
|
|
}; |
|
15374
|
|
|
|
|
|
|
|
|
15375
|
|
|
|
|
|
|
} // namespace parsito |
|
15376
|
|
|
|
|
|
|
|
|
15377
|
|
|
|
|
|
|
///////// |
|
15378
|
|
|
|
|
|
|
// File: parsito/network/neural_network.cpp |
|
15379
|
|
|
|
|
|
|
///////// |
|
15380
|
|
|
|
|
|
|
|
|
15381
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15382
|
|
|
|
|
|
|
// |
|
15383
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15384
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15385
|
|
|
|
|
|
|
// |
|
15386
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15387
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15388
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15389
|
|
|
|
|
|
|
|
|
15390
|
|
|
|
|
|
|
namespace parsito { |
|
15391
|
|
|
|
|
|
|
|
|
15392
|
2
|
|
|
|
|
|
void neural_network::load_matrix(binary_decoder& data, vector>& m) { |
|
15393
|
2
|
|
|
|
|
|
unsigned rows = data.next_4B(); |
|
15394
|
2
|
|
|
|
|
|
unsigned columns = data.next_4B(); |
|
15395
|
|
|
|
|
|
|
|
|
15396
|
2
|
|
|
|
|
|
m.resize(rows); |
|
15397
|
369
|
100
|
|
|
|
|
for (auto&& row : m) { |
|
15398
|
367
|
|
|
|
|
|
row.resize(columns); |
|
15399
|
367
|
|
|
|
|
|
memcpy(row.data(), data.next(columns), sizeof(float) * columns); |
|
15400
|
|
|
|
|
|
|
} |
|
15401
|
2
|
|
|
|
|
|
} |
|
15402
|
|
|
|
|
|
|
|
|
15403
|
1
|
|
|
|
|
|
void neural_network::load(binary_decoder& data) { |
|
15404
|
1
|
|
|
|
|
|
hidden_layer_activation = activation_function::type(data.next_1B()); |
|
15405
|
1
|
|
|
|
|
|
load_matrix(data, weights[0]); |
|
15406
|
1
|
|
|
|
|
|
load_matrix(data, weights[1]); |
|
15407
|
1
|
|
|
|
|
|
} |
|
15408
|
|
|
|
|
|
|
|
|
15409
|
62
|
|
|
|
|
|
void neural_network::propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, |
|
15410
|
|
|
|
|
|
|
vector& hidden_layer, vector& outcomes, const embeddings_cache* cache, bool softmax) const { |
|
15411
|
62
|
50
|
|
|
|
|
assert(!weights[0].empty()); |
|
15412
|
62
|
50
|
|
|
|
|
assert(!weights[1].empty()); |
|
15413
|
1178
|
100
|
|
|
|
|
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
15414
|
|
|
|
|
|
|
|
|
15415
|
62
|
|
|
|
|
|
unsigned hidden_layer_size = weights[0].front().size(); |
|
15416
|
62
|
|
|
|
|
|
unsigned outcomes_size = weights[1].front().size(); |
|
15417
|
|
|
|
|
|
|
|
|
15418
|
124
|
|
|
|
|
|
outcomes.assign(outcomes_size, 0); |
|
15419
|
|
|
|
|
|
|
|
|
15420
|
|
|
|
|
|
|
// Hidden layer |
|
15421
|
62
|
|
|
|
|
|
hidden_layer.assign(hidden_layer_size, 0); |
|
15422
|
|
|
|
|
|
|
|
|
15423
|
|
|
|
|
|
|
unsigned index = 0; |
|
15424
|
1178
|
100
|
|
|
|
|
for (unsigned sequence = 0; sequence < embedding_ids_sequences.size(); sequence++) |
|
15425
|
5580
|
100
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); index += embeddings[i].dimension, i++) |
|
15426
|
6104
|
100
|
|
|
|
|
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
15427
|
1640
|
|
|
|
|
|
unsigned word = embedding_ids_sequences[sequence]->at(i); |
|
15428
|
3280
|
50
|
|
|
|
|
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
15429
|
|
|
|
|
|
|
// Use cache |
|
15430
|
1640
|
|
|
|
|
|
const float* precomputed = cache->at(i)[word].data() + sequence * hidden_layer_size; |
|
15431
|
9840
|
100
|
|
|
|
|
for (unsigned j = 0; j < hidden_layer_size; j++) |
|
15432
|
16400
|
|
|
|
|
|
hidden_layer[j] += precomputed[j]; |
|
15433
|
|
|
|
|
|
|
} else { |
|
15434
|
|
|
|
|
|
|
// Compute directly |
|
15435
|
|
|
|
|
|
|
const float* embedding = embeddings[i].weight(word); |
|
15436
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
15437
|
0
|
0
|
|
|
|
|
for (unsigned k = 0; k < hidden_layer_size; k++) |
|
15438
|
0
|
|
|
|
|
|
hidden_layer[k] += embedding[j] * weights[0][index + j][k]; |
|
15439
|
|
|
|
|
|
|
} |
|
15440
|
|
|
|
|
|
|
} |
|
15441
|
372
|
100
|
|
|
|
|
for (unsigned i = 0; i < hidden_layer_size; i++) // Bias |
|
15442
|
930
|
|
|
|
|
|
hidden_layer[i] += weights[0][index][i]; |
|
15443
|
|
|
|
|
|
|
|
|
15444
|
|
|
|
|
|
|
// Activation function |
|
15445
|
62
|
|
|
|
|
|
switch (hidden_layer_activation) { |
|
15446
|
|
|
|
|
|
|
case activation_function::TANH: |
|
15447
|
62
|
50
|
|
|
|
|
if (!tanh_cache.empty()) |
|
15448
|
372
|
100
|
|
|
|
|
for (auto&& weight : hidden_layer) |
|
15449
|
310
|
50
|
|
|
|
|
weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)]; |
|
|
|
50
|
|
|
|
|
|
|
15450
|
|
|
|
|
|
|
else |
|
15451
|
62
|
0
|
|
|
|
|
for (auto&& weight : hidden_layer) |
|
15452
|
0
|
|
|
|
|
|
weight = tanh(weight); |
|
15453
|
|
|
|
|
|
|
break; |
|
15454
|
|
|
|
|
|
|
case activation_function::CUBIC: |
|
15455
|
0
|
0
|
|
|
|
|
for (auto&& weight : hidden_layer) |
|
15456
|
0
|
|
|
|
|
|
weight = weight * weight * weight; |
|
15457
|
|
|
|
|
|
|
break; |
|
15458
|
|
|
|
|
|
|
case activation_function::RELU: |
|
15459
|
0
|
0
|
|
|
|
|
for (auto&& weight : hidden_layer) |
|
15460
|
0
|
0
|
|
|
|
|
if (weight < 0) weight = 0; |
|
15461
|
|
|
|
|
|
|
break; |
|
15462
|
|
|
|
|
|
|
} |
|
15463
|
|
|
|
|
|
|
|
|
15464
|
372
|
100
|
|
|
|
|
for (unsigned i = 0; i < hidden_layer_size; i++) |
|
15465
|
4340
|
100
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
|
15466
|
16120
|
|
|
|
|
|
outcomes[j] += hidden_layer[i] * weights[1][i][j]; |
|
15467
|
868
|
100
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
|
15468
|
2418
|
|
|
|
|
|
outcomes[i] += weights[1][hidden_layer_size][i]; |
|
15469
|
|
|
|
|
|
|
|
|
15470
|
|
|
|
|
|
|
// Softmax if requested |
|
15471
|
62
|
50
|
|
|
|
|
if (softmax) { |
|
15472
|
62
|
|
|
|
|
|
float max = outcomes[0]; |
|
15473
|
806
|
100
|
|
|
|
|
for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i]; |
|
|
|
100
|
|
|
|
|
|
|
15474
|
|
|
|
|
|
|
|
|
15475
|
|
|
|
|
|
|
float sum = 0; |
|
15476
|
868
|
100
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) sum += (outcomes[i] = exp(outcomes[i] - max)); |
|
15477
|
62
|
|
|
|
|
|
sum = 1 / sum; |
|
15478
|
|
|
|
|
|
|
|
|
15479
|
868
|
100
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) outcomes[i] *= sum; |
|
15480
|
|
|
|
|
|
|
} |
|
15481
|
62
|
|
|
|
|
|
} |
|
15482
|
|
|
|
|
|
|
|
|
15483
|
1
|
|
|
|
|
|
void neural_network::generate_tanh_cache() { |
|
15484
|
1
|
|
|
|
|
|
tanh_cache.resize(2 * 10 * 32768); |
|
15485
|
655361
|
100
|
|
|
|
|
for (unsigned i = 0; i < tanh_cache.size(); i++) |
|
15486
|
655360
|
|
|
|
|
|
tanh_cache[i] = tanh(i / 32768.0 - 10); |
|
15487
|
1
|
|
|
|
|
|
} |
|
15488
|
|
|
|
|
|
|
|
|
15489
|
2
|
|
|
|
|
|
void neural_network::generate_embeddings_cache(const vector& embeddings, embeddings_cache& cache, unsigned max_words) const { |
|
15490
|
|
|
|
|
|
|
unsigned embeddings_dim = 0; |
|
15491
|
5
|
100
|
|
|
|
|
for (auto&& embedding : embeddings) embeddings_dim += embedding.dimension; |
|
15492
|
|
|
|
|
|
|
|
|
15493
|
1
|
|
|
|
|
|
unsigned sequences = weights[0].size() / embeddings_dim; |
|
15494
|
1
|
50
|
|
|
|
|
assert(sequences * embeddings_dim + 1 == weights[0].size()); |
|
15495
|
|
|
|
|
|
|
|
|
15496
|
1
|
|
|
|
|
|
unsigned hidden_layer_size = weights[0].front().size(); |
|
15497
|
|
|
|
|
|
|
|
|
15498
|
1
|
|
|
|
|
|
cache.resize(embeddings.size()); |
|
15499
|
5
|
100
|
|
|
|
|
for (unsigned i = 0, weight_index = 0; i < embeddings.size(); weight_index += embeddings[i].dimension, i++) { |
|
15500
|
|
|
|
|
|
|
unsigned words = 0; |
|
15501
|
35
|
50
|
|
|
|
|
while (words < max_words && embeddings[i].weight(words)) words++; |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
15502
|
|
|
|
|
|
|
|
|
15503
|
4
|
|
|
|
|
|
cache[i].resize(words); |
|
15504
|
31
|
100
|
|
|
|
|
for (unsigned word = 0; word < words; word++) { |
|
15505
|
27
|
|
|
|
|
|
const float* embedding = embeddings[i].weight(word); |
|
15506
|
|
|
|
|
|
|
|
|
15507
|
27
|
|
|
|
|
|
cache[i][word].assign(sequences * hidden_layer_size, 0); |
|
15508
|
513
|
100
|
|
|
|
|
for (unsigned sequence = 0, index = weight_index; sequence < sequences; index += embeddings_dim, sequence++) |
|
15509
|
2916
|
100
|
|
|
|
|
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
15510
|
14580
|
100
|
|
|
|
|
for (unsigned k = 0; k < hidden_layer_size; k++) |
|
15511
|
36450
|
|
|
|
|
|
cache[i][word][sequence * hidden_layer_size + k] += embedding[j] * weights[0][index + j][k]; |
|
15512
|
|
|
|
|
|
|
} |
|
15513
|
|
|
|
|
|
|
} |
|
15514
|
1
|
|
|
|
|
|
} |
|
15515
|
|
|
|
|
|
|
|
|
15516
|
|
|
|
|
|
|
} // namespace parsito |
|
15517
|
|
|
|
|
|
|
|
|
15518
|
|
|
|
|
|
|
///////// |
|
15519
|
|
|
|
|
|
|
// File: parsito/network/network_parameters.h |
|
15520
|
|
|
|
|
|
|
///////// |
|
15521
|
|
|
|
|
|
|
|
|
15522
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15523
|
|
|
|
|
|
|
// |
|
15524
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15525
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15526
|
|
|
|
|
|
|
// |
|
15527
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15528
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15529
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15530
|
|
|
|
|
|
|
|
|
15531
|
|
|
|
|
|
|
namespace parsito { |
|
15532
|
|
|
|
|
|
|
|
|
15533
|
|
|
|
|
|
|
struct network_trainer { |
|
15534
|
|
|
|
|
|
|
enum network_trainer_algorithm { |
|
15535
|
|
|
|
|
|
|
SGD, |
|
15536
|
|
|
|
|
|
|
SGD_MOMENTUM, |
|
15537
|
|
|
|
|
|
|
ADAGRAD, |
|
15538
|
|
|
|
|
|
|
ADADELTA, |
|
15539
|
|
|
|
|
|
|
ADAM, |
|
15540
|
|
|
|
|
|
|
}; |
|
15541
|
|
|
|
|
|
|
|
|
15542
|
|
|
|
|
|
|
network_trainer_algorithm algorithm; |
|
15543
|
|
|
|
|
|
|
float learning_rate, learning_rate_final; |
|
15544
|
|
|
|
|
|
|
float momentum, momentum2; |
|
15545
|
|
|
|
|
|
|
float epsilon; |
|
15546
|
|
|
|
|
|
|
}; |
|
15547
|
|
|
|
|
|
|
|
|
15548
|
|
|
|
|
|
|
struct network_parameters { |
|
15549
|
|
|
|
|
|
|
unsigned iterations; |
|
15550
|
|
|
|
|
|
|
int structured_interval; |
|
15551
|
|
|
|
|
|
|
unsigned hidden_layer; |
|
15552
|
|
|
|
|
|
|
activation_function::type hidden_layer_type; |
|
15553
|
|
|
|
|
|
|
network_trainer trainer; |
|
15554
|
|
|
|
|
|
|
unsigned batch_size; |
|
15555
|
|
|
|
|
|
|
float initialization_range; |
|
15556
|
|
|
|
|
|
|
float l1_regularization; |
|
15557
|
|
|
|
|
|
|
float l2_regularization; |
|
15558
|
|
|
|
|
|
|
float maxnorm_regularization; |
|
15559
|
|
|
|
|
|
|
float dropout_hidden, dropout_input; |
|
15560
|
|
|
|
|
|
|
bool early_stopping; |
|
15561
|
|
|
|
|
|
|
}; |
|
15562
|
|
|
|
|
|
|
|
|
15563
|
|
|
|
|
|
|
} // namespace parsito |
|
15564
|
|
|
|
|
|
|
|
|
15565
|
|
|
|
|
|
|
///////// |
|
15566
|
|
|
|
|
|
|
// File: parsito/network/neural_network_trainer.h |
|
15567
|
|
|
|
|
|
|
///////// |
|
15568
|
|
|
|
|
|
|
|
|
15569
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15570
|
|
|
|
|
|
|
// |
|
15571
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15572
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15573
|
|
|
|
|
|
|
// |
|
15574
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15575
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15576
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15577
|
|
|
|
|
|
|
|
|
15578
|
|
|
|
|
|
|
namespace parsito { |
|
15579
|
|
|
|
|
|
|
|
|
15580
|
|
|
|
|
|
|
class neural_network_trainer { |
|
15581
|
|
|
|
|
|
|
public: |
|
15582
|
|
|
|
|
|
|
neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size, |
|
15583
|
|
|
|
|
|
|
const network_parameters& parameters, mt19937& generator); |
|
15584
|
|
|
|
|
|
|
|
|
15585
|
|
|
|
|
|
|
bool next_iteration(); |
|
15586
|
|
|
|
|
|
|
|
|
15587
|
0
|
0
|
|
|
|
|
struct workspace { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15588
|
|
|
|
|
|
|
unsigned batch = 0; |
|
15589
|
|
|
|
|
|
|
vector outcomes; |
|
15590
|
|
|
|
|
|
|
vector hidden_layer; |
|
15591
|
|
|
|
|
|
|
vector error_outcomes; |
|
15592
|
|
|
|
|
|
|
vector error_hidden; |
|
15593
|
|
|
|
|
|
|
|
|
15594
|
|
|
|
|
|
|
// Delta accumulators |
|
15595
|
|
|
|
|
|
|
vector> weights_batch[2]; |
|
15596
|
|
|
|
|
|
|
vector>> error_embedding; |
|
15597
|
|
|
|
|
|
|
vector> error_embedding_nonempty; |
|
15598
|
|
|
|
|
|
|
|
|
15599
|
|
|
|
|
|
|
// Trainer data |
|
15600
|
|
|
|
|
|
|
struct trainer_data { |
|
15601
|
|
|
|
|
|
|
float delta = 0; |
|
15602
|
|
|
|
|
|
|
float gradient = 0; |
|
15603
|
|
|
|
|
|
|
}; |
|
15604
|
|
|
|
|
|
|
vector> weights_trainer[2]; |
|
15605
|
|
|
|
|
|
|
vector>> embedding_trainer; |
|
15606
|
|
|
|
|
|
|
|
|
15607
|
|
|
|
|
|
|
// Dropout vectors |
|
15608
|
|
|
|
|
|
|
vector input_dropout; |
|
15609
|
|
|
|
|
|
|
vector hidden_dropout; |
|
15610
|
|
|
|
|
|
|
vector hidden_kept; |
|
15611
|
|
|
|
|
|
|
}; |
|
15612
|
|
|
|
|
|
|
void propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, workspace& w) const; |
|
15613
|
|
|
|
|
|
|
void backpropagate(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w); |
|
15614
|
|
|
|
|
|
|
|
|
15615
|
|
|
|
|
|
|
void finalize_sentence(); |
|
15616
|
|
|
|
|
|
|
|
|
15617
|
|
|
|
|
|
|
void save_network(binary_encoder& enc) const; |
|
15618
|
|
|
|
|
|
|
|
|
15619
|
|
|
|
|
|
|
private: |
|
15620
|
|
|
|
|
|
|
struct trainer_sgd { |
|
15621
|
|
|
|
|
|
|
static bool need_trainer_data; |
|
15622
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
|
15623
|
|
|
|
|
|
|
}; |
|
15624
|
|
|
|
|
|
|
struct trainer_sgd_momentum { |
|
15625
|
|
|
|
|
|
|
static bool need_trainer_data; |
|
15626
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
|
15627
|
|
|
|
|
|
|
}; |
|
15628
|
|
|
|
|
|
|
struct trainer_adagrad { |
|
15629
|
|
|
|
|
|
|
static bool need_trainer_data; |
|
15630
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
|
15631
|
|
|
|
|
|
|
}; |
|
15632
|
|
|
|
|
|
|
struct trainer_adadelta { |
|
15633
|
|
|
|
|
|
|
static bool need_trainer_data; |
|
15634
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
|
15635
|
|
|
|
|
|
|
}; |
|
15636
|
|
|
|
|
|
|
struct trainer_adam { |
|
15637
|
|
|
|
|
|
|
static bool need_trainer_data; |
|
15638
|
|
|
|
|
|
|
static inline float delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data); |
|
15639
|
|
|
|
|
|
|
}; |
|
15640
|
|
|
|
|
|
|
template void backpropagate_template(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w); |
|
15641
|
|
|
|
|
|
|
|
|
15642
|
|
|
|
|
|
|
void l1_regularize(); |
|
15643
|
|
|
|
|
|
|
void maxnorm_regularize(); |
|
15644
|
|
|
|
|
|
|
|
|
15645
|
|
|
|
|
|
|
void save_matrix(const vector>& m, binary_encoder& enc) const; |
|
15646
|
|
|
|
|
|
|
|
|
15647
|
|
|
|
|
|
|
neural_network& network; |
|
15648
|
|
|
|
|
|
|
mt19937& generator; |
|
15649
|
|
|
|
|
|
|
unsigned iteration, iterations, steps; |
|
15650
|
|
|
|
|
|
|
network_trainer trainer; |
|
15651
|
|
|
|
|
|
|
unsigned batch_size; |
|
15652
|
|
|
|
|
|
|
float l1_regularization, l2_regularization, maxnorm_regularization; |
|
15653
|
|
|
|
|
|
|
float dropout_hidden, dropout_input; |
|
15654
|
|
|
|
|
|
|
}; |
|
15655
|
|
|
|
|
|
|
|
|
15656
|
|
|
|
|
|
|
} // namespace parsito |
|
15657
|
|
|
|
|
|
|
|
|
15658
|
|
|
|
|
|
|
///////// |
|
15659
|
|
|
|
|
|
|
// File: parsito/network/neural_network_trainer.cpp |
|
15660
|
|
|
|
|
|
|
///////// |
|
15661
|
|
|
|
|
|
|
|
|
15662
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
15663
|
|
|
|
|
|
|
// |
|
15664
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
15665
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
15666
|
|
|
|
|
|
|
// |
|
15667
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
15668
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
15669
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
15670
|
|
|
|
|
|
|
|
|
15671
|
|
|
|
|
|
|
namespace parsito { |
|
15672
|
|
|
|
|
|
|
|
|
15673
|
0
|
|
|
|
|
|
neural_network_trainer::neural_network_trainer(neural_network& network, unsigned input_size, unsigned output_size, |
|
15674
|
0
|
|
|
|
|
|
const network_parameters& parameters, mt19937& generator) : network(network), generator(generator) { |
|
15675
|
|
|
|
|
|
|
// Initialize hidden layer |
|
15676
|
0
|
|
|
|
|
|
network.hidden_layer_activation = parameters.hidden_layer_type; |
|
15677
|
0
|
0
|
|
|
|
|
if (parameters.hidden_layer) { |
|
15678
|
0
|
|
|
|
|
|
float uniform_pre_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range : |
|
15679
|
0
|
0
|
|
|
|
|
-parameters.initialization_range * sqrt(6.0 / float(input_size + parameters.hidden_layer)); |
|
15680
|
0
|
|
|
|
|
|
uniform_real_distribution uniform_pre_hidden(-uniform_pre_hidden_range, uniform_pre_hidden_range); |
|
15681
|
|
|
|
|
|
|
|
|
15682
|
0
|
|
|
|
|
|
network.weights[0].resize(input_size + 1/*bias*/); |
|
15683
|
0
|
0
|
|
|
|
|
for (auto&& row : network.weights[0]) { |
|
15684
|
0
|
|
|
|
|
|
row.resize(parameters.hidden_layer); |
|
15685
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) |
|
15686
|
0
|
|
|
|
|
|
weight = uniform_pre_hidden(generator); |
|
15687
|
|
|
|
|
|
|
} |
|
15688
|
|
|
|
|
|
|
|
|
15689
|
0
|
|
|
|
|
|
float uniform_post_hidden_range = parameters.initialization_range > 0 ? parameters.initialization_range : |
|
15690
|
0
|
0
|
|
|
|
|
-parameters.initialization_range * sqrt(6.0 / float(output_size + parameters.hidden_layer)); |
|
15691
|
0
|
|
|
|
|
|
uniform_real_distribution uniform_post_hidden(-uniform_post_hidden_range, uniform_post_hidden_range); |
|
15692
|
|
|
|
|
|
|
|
|
15693
|
0
|
|
|
|
|
|
network.weights[1].resize(parameters.hidden_layer + 1/*bias*/); |
|
15694
|
0
|
0
|
|
|
|
|
for (auto&& row : network.weights[1]) { |
|
15695
|
0
|
|
|
|
|
|
row.resize(output_size); |
|
15696
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) |
|
15697
|
0
|
|
|
|
|
|
weight = uniform_post_hidden(generator); |
|
15698
|
|
|
|
|
|
|
} |
|
15699
|
|
|
|
|
|
|
} |
|
15700
|
|
|
|
|
|
|
|
|
15701
|
|
|
|
|
|
|
// Store the network_parameters |
|
15702
|
0
|
|
|
|
|
|
iteration = steps = 0; |
|
15703
|
0
|
|
|
|
|
|
iterations = parameters.iterations; |
|
15704
|
0
|
|
|
|
|
|
trainer = parameters.trainer; |
|
15705
|
0
|
|
|
|
|
|
batch_size = parameters.batch_size; |
|
15706
|
0
|
|
|
|
|
|
l1_regularization = parameters.l1_regularization; |
|
15707
|
0
|
|
|
|
|
|
l2_regularization = parameters.l2_regularization; |
|
15708
|
0
|
|
|
|
|
|
maxnorm_regularization = parameters.maxnorm_regularization; |
|
15709
|
0
|
|
|
|
|
|
dropout_hidden = parameters.dropout_hidden; |
|
15710
|
0
|
|
|
|
|
|
dropout_input = parameters.dropout_input; |
|
15711
|
|
|
|
|
|
|
|
|
15712
|
|
|
|
|
|
|
// Maxnorm regularize the created weights |
|
15713
|
0
|
0
|
|
|
|
|
if (maxnorm_regularization) maxnorm_regularize(); |
|
15714
|
0
|
|
|
|
|
|
} |
|
15715
|
|
|
|
|
|
|
|
|
15716
|
0
|
|
|
|
|
|
bool neural_network_trainer::next_iteration() { |
|
15717
|
0
|
0
|
|
|
|
|
if (iteration++ >= iterations) return false; |
|
15718
|
|
|
|
|
|
|
|
|
15719
|
0
|
0
|
|
|
|
|
if (trainer.algorithm != network_trainer::ADADELTA) |
|
15720
|
0
|
0
|
|
|
|
|
if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1) |
|
|
|
0
|
|
|
|
|
|
|
15721
|
|
|
|
|
|
|
trainer.learning_rate = |
|
15722
|
0
|
|
|
|
|
|
exp(((iterations - iteration) * log(trainer.learning_rate) + log(trainer.learning_rate_final)) / (iterations - iteration + 1)); |
|
15723
|
|
|
|
|
|
|
|
|
15724
|
|
|
|
|
|
|
return true; |
|
15725
|
|
|
|
|
|
|
} |
|
15726
|
|
|
|
|
|
|
|
|
15727
|
0
|
|
|
|
|
|
void neural_network_trainer::propagate(const vector& embeddings, const vector*>& embedding_ids_sequences, workspace& w) const { |
|
15728
|
|
|
|
|
|
|
// Initialize dropout if requested |
|
15729
|
0
|
0
|
|
|
|
|
if (dropout_input) { |
|
15730
|
0
|
|
|
|
|
|
w.input_dropout.resize(network.weights[0].size()); |
|
15731
|
0
|
|
|
|
|
|
bernoulli_distribution dropout(dropout_input); |
|
15732
|
0
|
0
|
|
|
|
|
for (auto&& flag : w.input_dropout) |
|
15733
|
0
|
|
|
|
|
|
flag = dropout(generator); |
|
15734
|
|
|
|
|
|
|
} |
|
15735
|
|
|
|
|
|
|
|
|
15736
|
0
|
0
|
|
|
|
|
if (dropout_hidden) { |
|
15737
|
0
|
|
|
|
|
|
w.hidden_dropout.resize(network.weights[1].size()); |
|
15738
|
0
|
|
|
|
|
|
bernoulli_distribution dropout(dropout_hidden); |
|
15739
|
0
|
0
|
|
|
|
|
for (auto&& flag : w.hidden_dropout) |
|
15740
|
0
|
|
|
|
|
|
flag = dropout(generator); |
|
15741
|
|
|
|
|
|
|
} |
|
15742
|
|
|
|
|
|
|
w.hidden_kept.clear(); |
|
15743
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < network.weights[0].front().size(); i++) |
|
15744
|
0
|
0
|
|
|
|
|
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15745
|
0
|
|
|
|
|
|
w.hidden_kept.push_back(i); |
|
15746
|
|
|
|
|
|
|
|
|
15747
|
|
|
|
|
|
|
// Propagate |
|
15748
|
|
|
|
|
|
|
unsigned hidden_layer_size = network.weights[0].front().size(); |
|
15749
|
0
|
|
|
|
|
|
unsigned outcomes_size = network.weights[1].front().size(); |
|
15750
|
|
|
|
|
|
|
|
|
15751
|
0
|
|
|
|
|
|
w.outcomes.assign(outcomes_size, 0); |
|
15752
|
|
|
|
|
|
|
|
|
15753
|
|
|
|
|
|
|
// Hidden layer |
|
15754
|
0
|
|
|
|
|
|
w.hidden_layer.assign(hidden_layer_size, 0); |
|
15755
|
|
|
|
|
|
|
|
|
15756
|
|
|
|
|
|
|
unsigned index = 0; |
|
15757
|
0
|
0
|
|
|
|
|
for (auto&& embedding_ids : embedding_ids_sequences) |
|
15758
|
|
|
|
|
|
|
// Note: The unnecessary brackets on the following for cycle are needed |
|
15759
|
|
|
|
|
|
|
// to compile on VS 2015 Update 3, which otherwise fail to compile it. |
|
15760
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
15761
|
0
|
0
|
|
|
|
|
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15762
|
0
|
|
|
|
|
|
const float* embedding = embeddings[i].weight((*embedding_ids)[i]); |
|
15763
|
0
|
0
|
|
|
|
|
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, embedding++, index++) |
|
15764
|
0
|
0
|
|
|
|
|
if (w.input_dropout.empty() || !w.input_dropout[index]) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15765
|
0
|
0
|
|
|
|
|
for (auto&& j : w.hidden_kept) |
|
15766
|
0
|
|
|
|
|
|
w.hidden_layer[j] += *embedding * network.weights[0][index][j]; |
|
15767
|
|
|
|
|
|
|
} else { |
|
15768
|
0
|
|
|
|
|
|
index += embeddings[i].dimension; |
|
15769
|
|
|
|
|
|
|
} |
|
15770
|
|
|
|
|
|
|
} |
|
15771
|
0
|
0
|
|
|
|
|
if (dropout_input) { // Dropout normalization |
|
15772
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_input); |
|
15773
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
15774
|
0
|
|
|
|
|
|
w.hidden_layer[i] *= dropout_factor; |
|
15775
|
|
|
|
|
|
|
} |
|
15776
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) // Bias |
|
15777
|
0
|
|
|
|
|
|
w.hidden_layer[i] += network.weights[0][index][i]; |
|
15778
|
|
|
|
|
|
|
|
|
15779
|
|
|
|
|
|
|
// Activation function |
|
15780
|
0
|
|
|
|
|
|
switch (network.hidden_layer_activation) { |
|
15781
|
|
|
|
|
|
|
case activation_function::TANH: |
|
15782
|
0
|
0
|
|
|
|
|
for (auto&& weight : w.hidden_layer) |
|
15783
|
0
|
|
|
|
|
|
weight = tanh(weight); |
|
15784
|
|
|
|
|
|
|
break; |
|
15785
|
|
|
|
|
|
|
case activation_function::CUBIC: |
|
15786
|
0
|
0
|
|
|
|
|
for (auto&& weight : w.hidden_layer) |
|
15787
|
0
|
|
|
|
|
|
weight = weight * weight * weight; |
|
15788
|
|
|
|
|
|
|
break; |
|
15789
|
|
|
|
|
|
|
case activation_function::RELU: |
|
15790
|
0
|
0
|
|
|
|
|
for (auto&& weight : w.hidden_layer) |
|
15791
|
0
|
0
|
|
|
|
|
if (weight < 0) weight = 0; |
|
15792
|
|
|
|
|
|
|
break; |
|
15793
|
|
|
|
|
|
|
} |
|
15794
|
0
|
0
|
|
|
|
|
if (dropout_hidden) { // Dropout normalization |
|
15795
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_hidden); |
|
15796
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
15797
|
0
|
|
|
|
|
|
w.hidden_layer[i] *= dropout_factor; |
|
15798
|
|
|
|
|
|
|
} |
|
15799
|
|
|
|
|
|
|
|
|
15800
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
15801
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
|
15802
|
0
|
|
|
|
|
|
w.outcomes[j] += w.hidden_layer[i] * network.weights[1][i][j]; |
|
15803
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
|
15804
|
0
|
|
|
|
|
|
w.outcomes[i] += network.weights[1][hidden_layer_size][i]; |
|
15805
|
|
|
|
|
|
|
|
|
15806
|
|
|
|
|
|
|
// Softmax |
|
15807
|
0
|
|
|
|
|
|
float max = w.outcomes[0]; |
|
15808
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i]; |
|
|
|
0
|
|
|
|
|
|
|
15809
|
|
|
|
|
|
|
|
|
15810
|
|
|
|
|
|
|
float sum = 0; |
|
15811
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) sum += (w.outcomes[i] = exp(w.outcomes[i] - max)); |
|
15812
|
0
|
|
|
|
|
|
sum = 1 / sum; |
|
15813
|
|
|
|
|
|
|
|
|
15814
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) w.outcomes[i] *= sum; |
|
15815
|
0
|
|
|
|
|
|
} |
|
15816
|
|
|
|
|
|
|
|
|
15817
|
|
|
|
|
|
|
// SGD |
|
15818
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_sgd::need_trainer_data = false; |
|
15819
|
|
|
|
|
|
|
float neural_network_trainer::trainer_sgd::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& /*data*/) { |
|
15820
|
0
|
|
|
|
|
|
return trainer.learning_rate * gradient; |
|
15821
|
|
|
|
|
|
|
} |
|
15822
|
|
|
|
|
|
|
|
|
15823
|
|
|
|
|
|
|
// SGD with momentum |
|
15824
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_sgd_momentum::need_trainer_data = true; |
|
15825
|
|
|
|
|
|
|
float neural_network_trainer::trainer_sgd_momentum::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
|
15826
|
0
|
|
|
|
|
|
data.delta = trainer.momentum * data.delta + trainer.learning_rate * gradient; |
|
15827
|
|
|
|
|
|
|
return data.delta; |
|
15828
|
|
|
|
|
|
|
} |
|
15829
|
|
|
|
|
|
|
|
|
15830
|
|
|
|
|
|
|
// AdaGrad |
|
15831
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_adagrad::need_trainer_data = true; |
|
15832
|
|
|
|
|
|
|
float neural_network_trainer::trainer_adagrad::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
|
15833
|
0
|
|
|
|
|
|
data.gradient += gradient * gradient; |
|
15834
|
0
|
|
|
|
|
|
return trainer.learning_rate / sqrt(data.gradient + trainer.epsilon) * gradient; |
|
15835
|
|
|
|
|
|
|
} |
|
15836
|
|
|
|
|
|
|
|
|
15837
|
|
|
|
|
|
|
// AdaDelta |
|
15838
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_adadelta::need_trainer_data = true; |
|
15839
|
0
|
|
|
|
|
|
float neural_network_trainer::trainer_adadelta::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
|
15840
|
0
|
|
|
|
|
|
data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient * gradient; |
|
15841
|
0
|
|
|
|
|
|
float delta = sqrt(data.delta + trainer.epsilon) / sqrt(data.gradient + trainer.epsilon) * gradient; |
|
15842
|
0
|
|
|
|
|
|
data.delta = trainer.momentum * data.delta + (1 - trainer.momentum) * delta * delta; |
|
15843
|
0
|
|
|
|
|
|
return delta; |
|
15844
|
|
|
|
|
|
|
} |
|
15845
|
|
|
|
|
|
|
|
|
15846
|
|
|
|
|
|
|
// Adam |
|
15847
|
|
|
|
|
|
|
bool neural_network_trainer::trainer_adam::need_trainer_data = true; |
|
15848
|
0
|
|
|
|
|
|
float neural_network_trainer::trainer_adam::delta(float gradient, const network_trainer& trainer, workspace::trainer_data& data) { |
|
15849
|
0
|
|
|
|
|
|
data.gradient = trainer.momentum * data.gradient + (1 - trainer.momentum) * gradient; |
|
15850
|
0
|
|
|
|
|
|
data.delta = trainer.momentum2 * data.delta + (1 - trainer.momentum2) * gradient * gradient; |
|
15851
|
0
|
|
|
|
|
|
return trainer.learning_rate * data.gradient / sqrt(data.delta + trainer.epsilon); |
|
15852
|
|
|
|
|
|
|
} |
|
15853
|
|
|
|
|
|
|
|
|
15854
|
|
|
|
|
|
|
// Backpropagation |
|
15855
|
|
|
|
|
|
|
template |
|
15856
|
0
|
|
|
|
|
|
void neural_network_trainer::backpropagate_template(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) { |
|
15857
|
0
|
|
|
|
|
|
size_t hidden_layer_size = network.weights[0].front().size(); |
|
15858
|
0
|
|
|
|
|
|
size_t outcomes_size = network.weights[1].front().size(); |
|
15859
|
|
|
|
|
|
|
|
|
15860
|
|
|
|
|
|
|
// Allocate space for delta accumulators |
|
15861
|
0
|
0
|
|
|
|
|
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15862
|
0
|
0
|
|
|
|
|
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15863
|
0
|
0
|
|
|
|
|
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15864
|
0
|
0
|
|
|
|
|
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15865
|
|
|
|
|
|
|
|
|
15866
|
|
|
|
|
|
|
// Allocate space for trainer_data if required) |
|
15867
|
0
|
|
|
|
|
|
workspace::trainer_data none_trainer_data; |
|
15868
|
0
|
0
|
|
|
|
|
if (TRAINER::need_trainer_data) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15869
|
0
|
0
|
|
|
|
|
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15870
|
0
|
0
|
|
|
|
|
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15871
|
|
|
|
|
|
|
} |
|
15872
|
|
|
|
|
|
|
|
|
15873
|
|
|
|
|
|
|
// Compute error vector |
|
15874
|
0
|
|
|
|
|
|
w.error_outcomes.resize(outcomes_size); |
|
15875
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15876
|
0
|
0
|
|
|
|
|
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15877
|
|
|
|
|
|
|
|
|
15878
|
|
|
|
|
|
|
// Backpropagate error_outcomes to error_hidden |
|
15879
|
0
|
|
|
|
|
|
w.error_hidden.assign(hidden_layer_size, 0); |
|
15880
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15881
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15882
|
0
|
|
|
|
|
|
w.error_hidden[i] += network.weights[1][i][j] * w.error_outcomes[j]; |
|
15883
|
|
|
|
|
|
|
// Dropout normalization |
|
15884
|
0
|
0
|
|
|
|
|
if (dropout_hidden) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15885
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_hidden); |
|
15886
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15887
|
0
|
|
|
|
|
|
w.error_hidden[i] *= dropout_factor; |
|
15888
|
|
|
|
|
|
|
} |
|
15889
|
|
|
|
|
|
|
|
|
15890
|
|
|
|
|
|
|
// Perform activation function derivation |
|
15891
|
0
|
|
|
|
|
|
switch (network.hidden_layer_activation) { |
|
15892
|
|
|
|
|
|
|
case activation_function::TANH: |
|
15893
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15894
|
0
|
|
|
|
|
|
w.error_hidden[i] *= 1 - w.hidden_layer[i] * w.hidden_layer[i]; |
|
15895
|
|
|
|
|
|
|
break; |
|
15896
|
|
|
|
|
|
|
case activation_function::CUBIC: |
|
15897
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15898
|
0
|
|
|
|
|
|
float hidden_layer = cbrt(w.hidden_layer[i]); |
|
15899
|
0
|
|
|
|
|
|
w.error_hidden[i] *= 3 * hidden_layer * hidden_layer; |
|
15900
|
|
|
|
|
|
|
} |
|
15901
|
|
|
|
|
|
|
break; |
|
15902
|
|
|
|
|
|
|
case activation_function::RELU: |
|
15903
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15904
|
0
|
0
|
|
|
|
|
if (w.hidden_layer[i] <= 0) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15905
|
0
|
|
|
|
|
|
w.error_hidden[i] = 0; |
|
15906
|
|
|
|
|
|
|
break; |
|
15907
|
|
|
|
|
|
|
} |
|
15908
|
|
|
|
|
|
|
|
|
15909
|
|
|
|
|
|
|
// Update weights[1] |
|
15910
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15911
|
0
|
0
|
|
|
|
|
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15912
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15913
|
0
|
|
|
|
|
|
w.weights_batch[1][i][j] += w.hidden_layer[i] * w.error_outcomes[j]; |
|
15914
|
|
|
|
|
|
|
} |
|
15915
|
|
|
|
|
|
|
// Bias |
|
15916
|
0
|
0
|
|
|
|
|
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15917
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15918
|
0
|
|
|
|
|
|
w.weights_batch[1][hidden_layer_size][i] += w.error_outcomes[i]; |
|
15919
|
|
|
|
|
|
|
|
|
15920
|
|
|
|
|
|
|
// Dropout normalization |
|
15921
|
0
|
0
|
|
|
|
|
if (dropout_input) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15922
|
0
|
|
|
|
|
|
float dropout_factor = 1. / (1. - dropout_input); |
|
15923
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15924
|
0
|
|
|
|
|
|
w.error_hidden[i] *= dropout_factor; |
|
15925
|
|
|
|
|
|
|
} |
|
15926
|
|
|
|
|
|
|
// Update weights[0] and backpropagate to error_embedding |
|
15927
|
|
|
|
|
|
|
unsigned index = 0; |
|
15928
|
0
|
0
|
|
|
|
|
for (auto&& embedding_ids : embedding_ids_sequences) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15929
|
|
|
|
|
|
|
// Note: The unnecessary brackets on the following for cycle are needed |
|
15930
|
|
|
|
|
|
|
// to compile on VS 2015 Update 3, which otherwise fail to compile it. |
|
15931
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15932
|
0
|
0
|
|
|
|
|
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15933
|
0
|
|
|
|
|
|
int embedding_id = (*embedding_ids)[i]; |
|
15934
|
|
|
|
|
|
|
|
|
15935
|
|
|
|
|
|
|
float* error_embedding = nullptr; // Accumulate embedding error if required |
|
15936
|
0
|
0
|
|
|
|
|
if (embeddings[i].can_update_weights(embedding_id)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15937
|
0
|
0
|
|
|
|
|
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15938
|
0
|
0
|
|
|
|
|
if (w.error_embedding[i][embedding_id].empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15939
|
0
|
|
|
|
|
|
w.error_embedding[i][embedding_id].assign(embeddings[i].dimension, 0); |
|
15940
|
0
|
0
|
|
|
|
|
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15941
|
|
|
|
|
|
|
} |
|
15942
|
0
|
|
|
|
|
|
error_embedding = w.error_embedding[i][embedding_id].data(); |
|
15943
|
|
|
|
|
|
|
} |
|
15944
|
|
|
|
|
|
|
|
|
15945
|
0
|
|
|
|
|
|
const float* embedding = embeddings[i].weight(embedding_id); |
|
15946
|
0
|
0
|
|
|
|
|
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15947
|
0
|
0
|
|
|
|
|
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15948
|
0
|
0
|
|
|
|
|
if (error_embedding) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15949
|
0
|
0
|
|
|
|
|
for (auto&& j : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15950
|
0
|
|
|
|
|
|
*error_embedding += network.weights[0][index][j] * w.error_hidden[j]; |
|
15951
|
0
|
0
|
|
|
|
|
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15952
|
0
|
0
|
|
|
|
|
for (auto&& j : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15953
|
0
|
|
|
|
|
|
w.weights_batch[0][index][j] += *embedding * w.error_hidden[j]; |
|
15954
|
|
|
|
|
|
|
} |
|
15955
|
|
|
|
|
|
|
} else { |
|
15956
|
0
|
|
|
|
|
|
index += embeddings[i].dimension; |
|
15957
|
|
|
|
|
|
|
} |
|
15958
|
|
|
|
|
|
|
} |
|
15959
|
|
|
|
|
|
|
// Bias |
|
15960
|
|
|
|
|
|
|
{ |
|
15961
|
0
|
|
|
|
|
|
float negate_input_dropout = 1. - dropout_hidden; |
|
15962
|
0
|
0
|
|
|
|
|
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15963
|
0
|
0
|
|
|
|
|
for (auto&& i : w.hidden_kept) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15964
|
0
|
|
|
|
|
|
w.weights_batch[0][index][i] += w.error_hidden[i] * negate_input_dropout; |
|
15965
|
|
|
|
|
|
|
} |
|
15966
|
|
|
|
|
|
|
|
|
15967
|
|
|
|
|
|
|
// End if not at the end of the batch |
|
15968
|
0
|
0
|
|
|
|
|
if (++w.batch < batch_size) return; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15969
|
0
|
|
|
|
|
|
w.batch = 0; |
|
15970
|
|
|
|
|
|
|
|
|
15971
|
|
|
|
|
|
|
// Update hidden weights |
|
15972
|
0
|
0
|
|
|
|
|
if (!network.weights[0].empty()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15973
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 2; i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15974
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15975
|
0
|
0
|
|
|
|
|
if (!w.weights_batch[i][j].empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15976
|
0
|
0
|
|
|
|
|
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15977
|
0
|
0
|
|
|
|
|
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15978
|
|
|
|
|
|
|
w.weights_batch[i][j].clear(); |
|
15979
|
|
|
|
|
|
|
} |
|
15980
|
|
|
|
|
|
|
} |
|
15981
|
|
|
|
|
|
|
|
|
15982
|
|
|
|
|
|
|
// Update embedding weights using error_embedding |
|
15983
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15984
|
0
|
0
|
|
|
|
|
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15985
|
0
|
0
|
|
|
|
|
if (TRAINER::need_trainer_data) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15986
|
0
|
0
|
|
|
|
|
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15987
|
0
|
0
|
|
|
|
|
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15988
|
0
|
0
|
|
|
|
|
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15989
|
|
|
|
|
|
|
} |
|
15990
|
0
|
|
|
|
|
|
float* embedding = embeddings[i].weight(id); |
|
15991
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15992
|
0
|
0
|
|
|
|
|
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
15993
|
0
|
|
|
|
|
|
w.error_embedding[i][id].clear(); |
|
15994
|
|
|
|
|
|
|
} |
|
15995
|
|
|
|
|
|
|
w.error_embedding_nonempty[i].clear(); |
|
15996
|
|
|
|
|
|
|
} |
|
15997
|
|
|
|
|
|
|
|
|
15998
|
|
|
|
|
|
|
// Maxnorm regularize the updated weights |
|
15999
|
0
|
0
|
|
|
|
|
if (maxnorm_regularization) maxnorm_regularize(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16000
|
|
|
|
|
|
|
} |
|
16001
|
|
|
|
|
|
|
|
|
16002
|
0
|
|
|
|
|
|
void neural_network_trainer::backpropagate(vector& embeddings, const vector*>& embedding_ids_sequences, unsigned required_outcome, workspace& w) { |
|
16003
|
0
|
|
|
|
|
|
steps++; |
|
16004
|
|
|
|
|
|
|
|
|
16005
|
0
|
|
|
|
|
|
switch (trainer.algorithm) { |
|
16006
|
|
|
|
|
|
|
case network_trainer::SGD: |
|
16007
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
|
16008
|
0
|
|
|
|
|
|
return; |
|
16009
|
|
|
|
|
|
|
case network_trainer::SGD_MOMENTUM: |
|
16010
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
|
16011
|
0
|
|
|
|
|
|
return; |
|
16012
|
|
|
|
|
|
|
case network_trainer::ADAGRAD: |
|
16013
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
|
16014
|
0
|
|
|
|
|
|
return; |
|
16015
|
|
|
|
|
|
|
case network_trainer::ADADELTA: |
|
16016
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
|
16017
|
0
|
|
|
|
|
|
return; |
|
16018
|
|
|
|
|
|
|
case network_trainer::ADAM: |
|
16019
|
0
|
|
|
|
|
|
float original_learning_rate = trainer.learning_rate; |
|
16020
|
0
|
|
|
|
|
|
trainer.learning_rate *= sqrt(1-pow(trainer.momentum2, steps)) / (1-pow(trainer.momentum, steps)); |
|
16021
|
0
|
|
|
|
|
|
backpropagate_template(embeddings, embedding_ids_sequences, required_outcome, w); |
|
16022
|
0
|
|
|
|
|
|
trainer.learning_rate = original_learning_rate; |
|
16023
|
0
|
|
|
|
|
|
return; |
|
16024
|
|
|
|
|
|
|
} |
|
16025
|
|
|
|
|
|
|
|
|
16026
|
0
|
0
|
|
|
|
|
training_failure("Internal error, unsupported trainer!"); |
|
|
|
0
|
|
|
|
|
|
|
16027
|
|
|
|
|
|
|
} |
|
16028
|
|
|
|
|
|
|
|
|
16029
|
0
|
|
|
|
|
|
void neural_network_trainer::l1_regularize() { |
|
16030
|
0
|
0
|
|
|
|
|
if (!l1_regularization) return; |
|
16031
|
|
|
|
|
|
|
|
|
16032
|
0
|
0
|
|
|
|
|
for (auto&& weights : network.weights) |
|
16033
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i + 1 /*ignore biases*/ < weights.size(); i++) { |
|
16034
|
0
|
|
|
|
|
|
auto& row = weights[i]; |
|
16035
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) |
|
16036
|
0
|
0
|
|
|
|
|
if (weight < l1_regularization) weight += l1_regularization; |
|
16037
|
0
|
0
|
|
|
|
|
else if (weight > l1_regularization) weight -= l1_regularization; |
|
16038
|
0
|
|
|
|
|
|
else weight = 0; |
|
16039
|
|
|
|
|
|
|
} |
|
16040
|
|
|
|
|
|
|
} |
|
16041
|
|
|
|
|
|
|
|
|
16042
|
0
|
|
|
|
|
|
void neural_network_trainer::maxnorm_regularize() { |
|
16043
|
0
|
0
|
|
|
|
|
if (!maxnorm_regularization) return; |
|
16044
|
|
|
|
|
|
|
|
|
16045
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < 2; i++) |
|
16046
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < network.weights[i].front().size(); j++) { |
|
16047
|
|
|
|
|
|
|
float length = 0; |
|
16048
|
0
|
0
|
|
|
|
|
for (auto&& row : network.weights[i]) |
|
16049
|
0
|
|
|
|
|
|
length += row[j] * row[j]; |
|
16050
|
|
|
|
|
|
|
|
|
16051
|
0
|
0
|
|
|
|
|
if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) { |
|
|
|
0
|
|
|
|
|
|
|
16052
|
0
|
|
|
|
|
|
float factor = 1 / sqrt(length / (maxnorm_regularization * maxnorm_regularization)); |
|
16053
|
0
|
0
|
|
|
|
|
for (auto&& row : network.weights[i]) |
|
16054
|
0
|
|
|
|
|
|
row[j] *= factor; |
|
16055
|
|
|
|
|
|
|
} |
|
16056
|
|
|
|
|
|
|
} |
|
16057
|
|
|
|
|
|
|
} |
|
16058
|
|
|
|
|
|
|
|
|
16059
|
0
|
|
|
|
|
|
void neural_network_trainer::finalize_sentence() { |
|
16060
|
0
|
0
|
|
|
|
|
if (l1_regularization) l1_regularize(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16061
|
0
|
|
|
|
|
|
} |
|
16062
|
|
|
|
|
|
|
|
|
16063
|
0
|
|
|
|
|
|
void neural_network_trainer::save_matrix(const vector>& m, binary_encoder& enc) const { |
|
16064
|
0
|
|
|
|
|
|
enc.add_4B(m.size()); |
|
16065
|
0
|
0
|
|
|
|
|
enc.add_4B(m.empty() ? 0 : m.front().size()); |
|
16066
|
|
|
|
|
|
|
|
|
16067
|
0
|
0
|
|
|
|
|
for (auto&& row : m) { |
|
16068
|
0
|
0
|
|
|
|
|
assert(row.size() == m.front().size()); |
|
16069
|
|
|
|
|
|
|
enc.add_data(row); |
|
16070
|
|
|
|
|
|
|
} |
|
16071
|
0
|
|
|
|
|
|
} |
|
16072
|
|
|
|
|
|
|
|
|
16073
|
0
|
|
|
|
|
|
// Serializes the network into `enc`: the hidden layer activation function as
// one byte, followed by the two weight matrices (weights[0], then weights[1]).
void neural_network_trainer::save_network(binary_encoder& enc) const {
  enc.add_1B(network.hidden_layer_activation);

  for (unsigned layer = 0; layer < 2; layer++)
    save_matrix(network.weights[layer], enc);
}
|
16078
|
|
|
|
|
|
|
|
|
16079
|
|
|
|
|
|
|
} // namespace parsito |
|
16080
|
|
|
|
|
|
|
|
|
16081
|
|
|
|
|
|
|
///////// |
|
16082
|
|
|
|
|
|
|
// File: parsito/transition/transition.h |
|
16083
|
|
|
|
|
|
|
///////// |
|
16084
|
|
|
|
|
|
|
|
|
16085
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
|
16086
|
|
|
|
|
|
|
// |
|
16087
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16088
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16089
|
|
|
|
|
|
|
// |
|
16090
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16091
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16092
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16093
|
|
|
|
|
|
|
|
|
16094
|
|
|
|
|
|
|
namespace parsito { |
|
16095
|
|
|
|
|
|
|
|
|
16096
|
|
|
|
|
|
|
// Abstract transition class |
|
16097
|
13
|
|
|
|
|
|
class transition { |
|
16098
|
|
|
|
|
|
|
public: |
|
16099
|
13
|
|
|
|
|
|
virtual ~transition() {} |
|
16100
|
|
|
|
|
|
|
|
|
16101
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const = 0; |
|
16102
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const = 0; |
|
16103
|
|
|
|
|
|
|
}; |
|
16104
|
|
|
|
|
|
|
|
|
16105
|
|
|
|
|
|
|
// Specific transition classes |
|
16106
|
12
|
|
|
|
|
|
class transition_left_arc : public transition { |
|
16107
|
|
|
|
|
|
|
public: |
|
16108
|
6
|
|
|
|
|
|
transition_left_arc(const string& label) : label(label), label_is_root(label == "root") {} |
|
16109
|
|
|
|
|
|
|
|
|
16110
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
|
16111
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
|
16112
|
|
|
|
|
|
|
private: |
|
16113
|
|
|
|
|
|
|
string label; |
|
16114
|
|
|
|
|
|
|
bool label_is_root; |
|
16115
|
|
|
|
|
|
|
}; |
|
16116
|
|
|
|
|
|
|
|
|
16117
|
12
|
|
|
|
|
|
class transition_right_arc : public transition { |
|
16118
|
|
|
|
|
|
|
public: |
|
16119
|
6
|
|
|
|
|
|
transition_right_arc(const string& label) : label(label), label_is_root(label == "root") {} |
|
16120
|
|
|
|
|
|
|
|
|
16121
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
|
16122
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
|
16123
|
|
|
|
|
|
|
private: |
|
16124
|
|
|
|
|
|
|
string label; |
|
16125
|
|
|
|
|
|
|
bool label_is_root; |
|
16126
|
|
|
|
|
|
|
}; |
|
16127
|
|
|
|
|
|
|
|
|
16128
|
2
|
|
|
|
|
|
class transition_shift : public transition { |
|
16129
|
|
|
|
|
|
|
public: |
|
16130
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
|
16131
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
|
16132
|
|
|
|
|
|
|
}; |
|
16133
|
|
|
|
|
|
|
|
|
16134
|
0
|
|
|
|
|
|
class transition_swap : public transition { |
|
16135
|
|
|
|
|
|
|
public: |
|
16136
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
|
16137
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
|
16138
|
|
|
|
|
|
|
}; |
|
16139
|
|
|
|
|
|
|
|
|
16140
|
0
|
|
|
|
|
|
class transition_left_arc_2 : public transition { |
|
16141
|
|
|
|
|
|
|
public: |
|
16142
|
0
|
|
|
|
|
|
transition_left_arc_2(const string& label) : label(label), label_is_root(label == "root") {} |
|
16143
|
|
|
|
|
|
|
|
|
16144
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
|
16145
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
|
16146
|
|
|
|
|
|
|
private: |
|
16147
|
|
|
|
|
|
|
string label; |
|
16148
|
|
|
|
|
|
|
bool label_is_root; |
|
16149
|
|
|
|
|
|
|
}; |
|
16150
|
|
|
|
|
|
|
|
|
16151
|
0
|
|
|
|
|
|
class transition_right_arc_2 : public transition { |
|
16152
|
|
|
|
|
|
|
public: |
|
16153
|
0
|
|
|
|
|
|
transition_right_arc_2(const string& label) : label(label), label_is_root(label == "root") {} |
|
16154
|
|
|
|
|
|
|
|
|
16155
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf) const override; |
|
16156
|
|
|
|
|
|
|
virtual int perform(configuration& conf) const override; |
|
16157
|
|
|
|
|
|
|
private: |
|
16158
|
|
|
|
|
|
|
string label; |
|
16159
|
|
|
|
|
|
|
bool label_is_root; |
|
16160
|
|
|
|
|
|
|
}; |
|
16161
|
|
|
|
|
|
|
|
|
16162
|
|
|
|
|
|
|
} // namespace parsito |
|
16163
|
|
|
|
|
|
|
|
|
16164
|
|
|
|
|
|
|
///////// |
|
16165
|
|
|
|
|
|
|
// File: parsito/transition/transition_oracle.h |
|
16166
|
|
|
|
|
|
|
///////// |
|
16167
|
|
|
|
|
|
|
|
|
16168
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
|
16169
|
|
|
|
|
|
|
// |
|
16170
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16171
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16172
|
|
|
|
|
|
|
// |
|
16173
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16174
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16175
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16176
|
|
|
|
|
|
|
|
|
16177
|
|
|
|
|
|
|
namespace parsito { |
|
16178
|
|
|
|
|
|
|
|
|
16179
|
0
|
|
|
|
|
|
class transition_oracle { |
|
16180
|
|
|
|
|
|
|
public: |
|
16181
|
0
|
|
|
|
|
|
virtual ~transition_oracle() {} |
|
16182
|
|
|
|
|
|
|
|
|
16183
|
|
|
|
|
|
|
struct predicted_transition { |
|
16184
|
|
|
|
|
|
|
unsigned best; |
|
16185
|
|
|
|
|
|
|
unsigned to_follow; |
|
16186
|
|
|
|
|
|
|
|
|
16187
|
|
|
|
|
|
|
predicted_transition(unsigned best, unsigned to_follow) : best(best), to_follow(to_follow) {} |
|
16188
|
|
|
|
|
|
|
}; |
|
16189
|
|
|
|
|
|
|
|
|
16190
|
0
|
|
|
|
|
|
class tree_oracle { |
|
16191
|
|
|
|
|
|
|
public: |
|
16192
|
0
|
|
|
|
|
|
virtual ~tree_oracle() {} |
|
16193
|
|
|
|
|
|
|
|
|
16194
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const = 0; |
|
16195
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const = 0; |
|
16196
|
|
|
|
|
|
|
}; |
|
16197
|
|
|
|
|
|
|
|
|
16198
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const = 0; |
|
16199
|
|
|
|
|
|
|
}; |
|
16200
|
|
|
|
|
|
|
|
|
16201
|
|
|
|
|
|
|
} // namespace parsito |
|
16202
|
|
|
|
|
|
|
|
|
16203
|
|
|
|
|
|
|
///////// |
|
16204
|
|
|
|
|
|
|
// File: parsito/transition/transition_system.h |
|
16205
|
|
|
|
|
|
|
///////// |
|
16206
|
|
|
|
|
|
|
|
|
16207
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
|
16208
|
|
|
|
|
|
|
// |
|
16209
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16210
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16211
|
|
|
|
|
|
|
// |
|
16212
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16213
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16214
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16215
|
|
|
|
|
|
|
|
|
16216
|
|
|
|
|
|
|
namespace parsito { |
|
16217
|
|
|
|
|
|
|
|
|
16218
|
|
|
|
|
|
|
class transition_system { |
|
16219
|
|
|
|
|
|
|
public: |
|
16220
|
1
|
|
|
|
|
|
virtual ~transition_system() {} |
|
16221
|
|
|
|
|
|
|
|
|
16222
|
|
|
|
|
|
|
virtual unsigned transition_count() const; |
|
16223
|
|
|
|
|
|
|
virtual bool applicable(const configuration& conf, unsigned transition) const; |
|
16224
|
|
|
|
|
|
|
virtual int perform(configuration& conf, unsigned transition) const; |
|
16225
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const = 0; |
|
16226
|
|
|
|
|
|
|
|
|
16227
|
|
|
|
|
|
|
static transition_system* create(const string& name, const vector& labels); |
|
16228
|
|
|
|
|
|
|
|
|
16229
|
|
|
|
|
|
|
protected: |
|
16230
|
1
|
|
|
|
|
|
transition_system(const vector& labels) : labels(labels) {} |
|
16231
|
|
|
|
|
|
|
|
|
16232
|
|
|
|
|
|
|
const vector& labels; |
|
16233
|
|
|
|
|
|
|
vector> transitions; |
|
16234
|
|
|
|
|
|
|
}; |
|
16235
|
|
|
|
|
|
|
|
|
16236
|
|
|
|
|
|
|
} // namespace parsito |
|
16237
|
|
|
|
|
|
|
|
|
16238
|
|
|
|
|
|
|
///////// |
|
16239
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn.h |
|
16240
|
|
|
|
|
|
|
///////// |
|
16241
|
|
|
|
|
|
|
|
|
16242
|
|
|
|
|
|
|
// This file is part of Parsito <http://github.com/ufal/parsito/>. |
|
16243
|
|
|
|
|
|
|
// |
|
16244
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16245
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16246
|
|
|
|
|
|
|
// |
|
16247
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16248
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16249
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16250
|
|
|
|
|
|
|
|
|
16251
|
|
|
|
|
|
|
namespace parsito { |
|
16252
|
|
|
|
|
|
|
|
|
16253
|
5
|
|
|
|
|
|
class parser_nn : public parser { |
|
16254
|
|
|
|
|
|
|
public: |
|
16255
|
|
|
|
|
|
|
parser_nn(bool versioned); |
|
16256
|
|
|
|
|
|
|
|
|
16257
|
|
|
|
|
|
|
virtual void parse(tree& t, unsigned beam_size = 0, double* cost = nullptr) const override; |
|
16258
|
|
|
|
|
|
|
|
|
16259
|
|
|
|
|
|
|
protected: |
|
16260
|
|
|
|
|
|
|
virtual void load(binary_decoder& data, unsigned cache) override; |
|
16261
|
|
|
|
|
|
|
|
|
16262
|
|
|
|
|
|
|
private: |
|
16263
|
|
|
|
|
|
|
friend class parser_nn_trainer; |
|
16264
|
|
|
|
|
|
|
void parse_greedy(tree& t, double* cost) const; |
|
16265
|
|
|
|
|
|
|
void parse_beam_search(tree& t, unsigned beam_size, double* cost) const; |
|
16266
|
|
|
|
|
|
|
|
|
16267
|
|
|
|
|
|
|
bool versioned; |
|
16268
|
|
|
|
|
|
|
unsigned version; |
|
16269
|
|
|
|
|
|
|
bool single_root; |
|
16270
|
|
|
|
|
|
|
enum { VERSION_LATEST = 2 }; |
|
16271
|
|
|
|
|
|
|
|
|
16272
|
|
|
|
|
|
|
vector labels; |
|
16273
|
|
|
|
|
|
|
unique_ptr system; |
|
16274
|
|
|
|
|
|
|
|
|
16275
|
|
|
|
|
|
|
node_extractor nodes; |
|
16276
|
|
|
|
|
|
|
|
|
16277
|
|
|
|
|
|
|
vector values; |
|
16278
|
|
|
|
|
|
|
vector embeddings; |
|
16279
|
|
|
|
|
|
|
|
|
16280
|
|
|
|
|
|
|
neural_network network; |
|
16281
|
|
|
|
|
|
|
neural_network::embeddings_cache embeddings_cache; |
|
16282
|
|
|
|
|
|
|
|
|
16283
|
6
|
50
|
|
|
|
|
struct workspace { |
|
|
|
100
|
|
|
|
|
|
|
16284
|
4
|
100
|
|
|
|
|
workspace(bool single_root) : conf(single_root) {} |
|
16285
|
|
|
|
|
|
|
|
|
16286
|
|
|
|
|
|
|
configuration conf; |
|
16287
|
|
|
|
|
|
|
|
|
16288
|
|
|
|
|
|
|
string word, word_buffer; |
|
16289
|
|
|
|
|
|
|
vector> embeddings; |
|
16290
|
|
|
|
|
|
|
vector> embeddings_values; |
|
16291
|
|
|
|
|
|
|
|
|
16292
|
|
|
|
|
|
|
vector extracted_nodes; |
|
16293
|
|
|
|
|
|
|
vector*> extracted_embeddings; |
|
16294
|
|
|
|
|
|
|
|
|
16295
|
|
|
|
|
|
|
vector outcomes, network_buffer; |
|
16296
|
|
|
|
|
|
|
|
|
16297
|
|
|
|
|
|
|
// Beam-size structures |
|
16298
|
228
|
|
|
|
|
|
struct beam_size_configuration { |
|
16299
|
|
|
|
|
|
|
beam_size_configuration(bool single_root) : conf(single_root) {} |
|
16300
|
|
|
|
|
|
|
|
|
16301
|
|
|
|
|
|
|
configuration conf; |
|
16302
|
|
|
|
|
|
|
vector heads; |
|
16303
|
|
|
|
|
|
|
vector deprels; |
|
16304
|
|
|
|
|
|
|
double cost; |
|
16305
|
|
|
|
|
|
|
|
|
16306
|
|
|
|
|
|
|
void refresh_tree(); |
|
16307
|
|
|
|
|
|
|
void save_tree(); |
|
16308
|
|
|
|
|
|
|
}; |
|
16309
|
|
|
|
|
|
|
struct beam_size_alternative { |
|
16310
|
|
|
|
|
|
|
const beam_size_configuration* bs_conf; |
|
16311
|
|
|
|
|
|
|
int transition; |
|
16312
|
|
|
|
|
|
|
double cost; |
|
16313
|
|
|
|
|
|
|
bool operator<(const beam_size_alternative& other) const { return cost > other.cost; } |
|
16314
|
|
|
|
|
|
|
|
|
16315
|
|
|
|
|
|
|
beam_size_alternative(const beam_size_configuration* bs_conf, int transition, double cost) |
|
16316
|
241
|
|
|
|
|
|
: bs_conf(bs_conf), transition(transition), cost(cost) {} |
|
16317
|
|
|
|
|
|
|
}; |
|
16318
|
|
|
|
|
|
|
vector bs_confs[2]; size_t bs_confs_size[2]; |
|
16319
|
|
|
|
|
|
|
vector bs_alternatives; |
|
16320
|
|
|
|
|
|
|
}; |
|
16321
|
|
|
|
|
|
|
mutable threadsafe_stack workspaces; |
|
16322
|
|
|
|
|
|
|
}; |
|
16323
|
|
|
|
|
|
|
|
|
16324
|
|
|
|
|
|
|
} // namespace parsito |
|
16325
|
|
|
|
|
|
|
|
|
16326
|
|
|
|
|
|
|
///////// |
|
16327
|
|
|
|
|
|
|
// File: parsito/parser/parser.cpp |
|
16328
|
|
|
|
|
|
|
///////// |
|
16329
|
|
|
|
|
|
|
|
|
16330
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
16331
|
|
|
|
|
|
|
// |
|
16332
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16333
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16334
|
|
|
|
|
|
|
// |
|
16335
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16336
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16337
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16338
|
|
|
|
|
|
|
|
|
16339
|
|
|
|
|
|
|
namespace parsito { |
|
16340
|
|
|
|
|
|
|
|
|
16341
|
0
|
|
|
|
|
|
// Loads a parser model from the file at `file` (converted by path_from_utf8),
// opened in binary mode. Returns nullptr when the file cannot be opened;
// otherwise returns the result of the stream overload of load().
parser* parser::load(const char* file, unsigned cache) {
  ifstream model_file(path_from_utf8(file).c_str(), ifstream::in | ifstream::binary);
  return model_file.is_open() ? load(model_file, cache) : nullptr;
}
|
16346
|
|
|
|
|
|
|
|
|
16347
|
1
|
|
|
|
|
|
// Loads a parser model from an open stream.
// The stream is decoded by compressor::load; the first string in the decoded
// data names the parser implementation, which is instantiated through
// create() and then asked to load itself from the remaining data.
// Returns the new parser (ownership passes to the caller), or nullptr when
// decoding fails, the name is unknown, loading throws binary_decoder_error,
// or unread data remains after loading.
parser* parser::load(istream& in, unsigned cache) {
  unique_ptr<parser> result;

  binary_decoder data;
  if (!compressor::load(in, data)) return nullptr;

  try {
    string name;
    data.next_str(name);

    result.reset(create(name));
    if (!result) return nullptr;

    result->load(data, cache);
  } catch (const binary_decoder_error&) {
    return nullptr;
  }

  // Trailing bytes mean the model does not match the expected format.
  return result && data.is_end() ? result.release() : nullptr;
}
|
16367
|
|
|
|
|
|
|
|
|
16368
|
1
|
|
|
|
|
|
// Instantiates the parser implementation registered under `name`:
// "nn" gives an unversioned parser_nn, "nn_versioned" a versioned one.
// Returns nullptr for any other name.
parser* parser::create(const string& name) {
  const bool is_plain = name == "nn";
  const bool is_versioned = name == "nn_versioned";
  if (!is_plain && !is_versioned) return nullptr;
  return new parser_nn(is_versioned);
}
|
16373
|
|
|
|
|
|
|
|
|
16374
|
|
|
|
|
|
|
} // namespace parsito |
|
16375
|
|
|
|
|
|
|
|
|
16376
|
|
|
|
|
|
|
///////// |
|
16377
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn.cpp |
|
16378
|
|
|
|
|
|
|
///////// |
|
16379
|
|
|
|
|
|
|
|
|
16380
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
16381
|
|
|
|
|
|
|
// |
|
16382
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16383
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16384
|
|
|
|
|
|
|
// |
|
16385
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16386
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16387
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16388
|
|
|
|
|
|
|
|
|
16389
|
|
|
|
|
|
|
namespace parsito { |
|
16390
|
|
|
|
|
|
|
|
|
16391
|
|
|
|
|
|
|
// Versions: |
|
16392
|
|
|
|
|
|
|
// 1: initial version |
|
16393
|
|
|
|
|
|
|
// 2: add ReLU activation function |
|
16394
|
|
|
|
|
|
|
|
|
16395
|
1
|
|
|
|
|
|
// Creates a parser with no model loaded. `versioned` records whether load()
// should expect a leading version byte. `version` and `single_root` start
// with the values load() assigns for unversioned models, so they never hold
// indeterminate values before a model is loaded.
parser_nn::parser_nn(bool versioned) : versioned(versioned), version(1), single_root(false) {}
|
16396
|
|
|
|
|
|
|
|
|
16397
|
1
|
|
|
|
|
|
// Parses `t` in place, dispatching on the beam size: values of 0 and 1 select
// greedy decoding, anything larger selects beam search.
void parser_nn::parse(tree& t, unsigned beam_size, double* cost) const {
  if (beam_size <= 1) {
    parse_greedy(t, cost);
    return;
  }
  parse_beam_search(t, beam_size, cost);
}
|
16403
|
|
|
|
|
|
|
|
|
16404
|
0
|
|
|
|
|
|
// Greedy decoding: in every step the applicable transition with the highest
// network outcome is performed, until the configuration is final.
//   t    - tree to parse; it is modified in place by the performed transitions
//   cost - optional output; receives the sum of logs of the chosen outcomes,
//          divided by the number of performed transitions and multiplied by
//          (t.nodes.size() - 1)
void parser_nn::parse_greedy(tree& t, double* cost) const {
  assert(system);
  if (cost) *cost = 0.;

  // Retrieve or create workspace. It is owned by a unique_ptr until it is
  // handed back to the stack, so it is not leaked if anything below throws.
  unique_ptr<workspace> w(workspaces.pop());
  if (!w) w.reset(new workspace(single_root));

  // Create configuration
  w->conf.init(&t);

  // Compute embeddings of all nodes (buffers only grow, they are reused across calls)
  if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
  for (size_t i = 0; i < t.nodes.size(); i++) {
    if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
    for (size_t j = 0; j < embeddings.size(); j++) {
      values[j].extract(t.nodes[i], w->word);
      w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer);
    }
  }

  // Compute which transitions to perform and perform them
  int transitions = 0;
  for (; !w->conf.final(); transitions++) {
    // Extract nodes from the configuration
    nodes.extract(w->conf, w->extracted_nodes);
    w->extracted_embeddings.resize(w->extracted_nodes.size());
    for (size_t i = 0; i < w->extracted_nodes.size(); i++)
      w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;

    // Classify using neural network. The last argument is set only when a cost
    // was requested (presumably it asks for normalized outcomes -- confirm).
    network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache, cost != nullptr);

    // Find most probable applicable transition
    // NOTE(review): assumes at least one transition is applicable in every
    // non-final configuration; otherwise best stays -1 -- confirm.
    int best = -1;
    for (unsigned i = 0; i < w->outcomes.size(); i++)
      if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best]))
        best = i;

    // Perform the best transition
    int child = system->perform(w->conf, best);
    if (cost) *cost += log(w->outcomes[best]);

    // If a node was linked, recompute its embeddings as deprel has changed
    if (child >= 0)
      for (size_t i = 0; i < embeddings.size(); i++) {
        values[i].extract(t.nodes[child], w->word);
        w->embeddings[child][i] = embeddings[i].lookup_word(w->word, w->word_buffer);
      }
  }

  // Rescale the accumulated cost from the number of performed transitions
  // to (number of nodes - 1).
  if (cost && transitions)
    *cost = *cost / transitions * (t.nodes.size() - 1);

  // Store workspace
  workspaces.push(w.release());
}
|
16461
|
|
|
|
|
|
|
|
|
16462
|
1
|
|
|
|
|
|
// Beam search decoding. Up to `beam_size` configurations are kept; in every
// iteration each non-final configuration is expanded by all applicable
// transitions and the `beam_size` best-scoring alternatives survive. The score
// of a configuration is the average of the logs of the outcomes chosen so far
// (higher is better). The loop ends once every kept configuration is final,
// and the tree of the best one is written into `t`.
//   t         - tree to parse, modified in place
//   beam_size - number of configurations kept; parse() calls this only with
//               beam_size > 1
//   cost      - optional output; receives the best score multiplied by
//               (t.nodes.size() - 1)
void parser_nn::parse_beam_search(tree& t, unsigned beam_size, double* cost) const {
  assert(system);

  // Retrieve or create workspace. It is owned by a unique_ptr until it is
  // handed back to the stack, so it is not leaked if anything below throws.
  unique_ptr<workspace> w(workspaces.pop());
  if (!w) w.reset(new workspace(single_root));

  // Allocate and initialize configuration
  for (int i = 0; i < 2; i++) {
    while (w->bs_confs[i].size() < beam_size) w->bs_confs[i].emplace_back(single_root);
    while (w->bs_confs[i].size() > beam_size) w->bs_confs[i].pop_back();
    w->bs_confs_size[i] = 0;
  }
  w->bs_confs[0][0].cost = 0;
  w->bs_confs[0][0].conf.init(&t);
  w->bs_confs[0][0].save_tree();
  w->bs_confs_size[0] = 1;

  // Compute embeddings of all nodes
  if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size());
  if (w->embeddings_values.size() < t.nodes.size()) w->embeddings_values.resize(t.nodes.size());
  for (size_t i = 0; i < t.nodes.size(); i++) {
    if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size());
    if (w->embeddings_values[i].size() < embeddings.size()) w->embeddings_values[i].resize(embeddings.size());
    for (size_t j = 0; j < embeddings.size(); j++) {
      values[j].extract(t.nodes[i], w->embeddings_values[i][j]);
      w->embeddings[i][j] = embeddings[j].lookup_word(w->embeddings_values[i][j], w->word_buffer);
    }
  }

  // Compute which transitions to perform and perform them.
  // Configurations of the current iteration live in bs_confs[iteration & 1],
  // the survivors are written to bs_confs[(iteration + 1) & 1].
  size_t iteration = 0;
  for (bool all_final = false; !all_final; iteration++) {
    all_final = true;
    w->bs_alternatives.clear();

    for (size_t c = 0; c < w->bs_confs_size[iteration & 1]; c++) {
      auto& bs_conf = w->bs_confs[iteration & 1][c];

      // A final configuration competes for a beam slot unchanged (transition -1).
      if (bs_conf.conf.final()) {
        if (w->bs_alternatives.size() == beam_size) {
          // The heap front is the worst kept alternative.
          if (bs_conf.cost <= w->bs_alternatives[0].cost) continue;
          pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
          w->bs_alternatives.pop_back();
        }
        w->bs_alternatives.emplace_back(&bs_conf, -1, bs_conf.cost);
        push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
        continue;
      }
      all_final = false;

      bs_conf.refresh_tree();
      // Update embeddings for all nodes
      for (size_t i = 0; i < t.nodes.size(); i++)
        for (size_t j = 0; j < embeddings.size(); j++) {
          values[j].extract(t.nodes[i], w->word);
          // Look the word up again only if the extracted value has changed.
          if (w->word != w->embeddings_values[i][j]) {
            w->embeddings[i][j] = embeddings[j].lookup_word(w->word, w->word_buffer);
            w->embeddings_values[i][j].assign(w->word);
          }
        }

      // Extract nodes from the configuration
      nodes.extract(bs_conf.conf, w->extracted_nodes);
      w->extracted_embeddings.resize(w->extracted_nodes.size());
      for (size_t i = 0; i < w->extracted_nodes.size(); i++)
        w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr;

      // Classify using neural network
      network.propagate(embeddings, w->extracted_embeddings, w->network_buffer, w->outcomes, &embeddings_cache);

      // Store all alternatives
      for (unsigned i = 0; i < w->outcomes.size(); i++)
        if (system->applicable(bs_conf.conf, i)) {
          // Running average of the log outcomes over iteration + 1 transitions.
          double alternative_cost = (bs_conf.cost * iteration + log(w->outcomes[i])) / (iteration + 1);
          if (w->bs_alternatives.size() == beam_size) {
            if (alternative_cost <= w->bs_alternatives[0].cost) continue;
            pop_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
            w->bs_alternatives.pop_back();
          }
          w->bs_alternatives.emplace_back(&bs_conf, i, alternative_cost);
          push_heap(w->bs_alternatives.begin(), w->bs_alternatives.end());
        }
    }

    // Materialize the surviving alternatives into the other buffer.
    w->bs_confs_size[(iteration + 1) & 1] = 0;
    for (auto&& alternative : w->bs_alternatives) {
      auto& bs_conf_new = w->bs_confs[(iteration + 1) & 1][w->bs_confs_size[(iteration + 1) & 1]++];
      bs_conf_new = *alternative.bs_conf;
      bs_conf_new.cost = alternative.cost;
      if (alternative.transition >= 0) {
        bs_conf_new.refresh_tree();
        system->perform(bs_conf_new.conf, alternative.transition);
        bs_conf_new.save_tree();
      }
    }
  }

  // Return the best tree
  size_t best = 0;
  for (size_t i = 1; i < w->bs_confs_size[iteration & 1]; i++)
    if (w->bs_confs[iteration & 1][i].cost > w->bs_confs[iteration & 1][best].cost)
      best = i;
  w->bs_confs[iteration & 1][best].refresh_tree();

  if (cost) *cost = w->bs_confs[iteration & 1][best].cost * (t.nodes.size() - 1);

  // Store workspace
  workspaces.push(w.release());
}
|
16572
|
|
|
|
|
|
|
|
|
16573
|
129
|
|
|
|
|
|
void parser_nn::workspace::beam_size_configuration::refresh_tree() { |
|
16574
|
1161
|
100
|
|
|
|
|
for (auto&& node : conf.t->nodes) node.children.clear(); |
|
16575
|
1161
|
100
|
|
|
|
|
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
|
16576
|
1032
|
|
|
|
|
|
conf.t->nodes[i].head = heads[i]; |
|
16577
|
2064
|
|
|
|
|
|
conf.t->nodes[i].deprel = deprels[i]; |
|
16578
|
1334
|
100
|
|
|
|
|
if (heads[i] >= 0) conf.t->nodes[heads[i]].children.push_back(i); |
|
16579
|
|
|
|
|
|
|
} |
|
16580
|
129
|
|
|
|
|
|
} |
|
16581
|
|
|
|
|
|
|
|
|
16582
|
67
|
|
|
|
|
|
void parser_nn::workspace::beam_size_configuration::save_tree() { |
|
16583
|
67
|
100
|
|
|
|
|
if (conf.t->nodes.size() > heads.size()) heads.resize(conf.t->nodes.size()); |
|
16584
|
67
|
100
|
|
|
|
|
if (conf.t->nodes.size() > deprels.size()) deprels.resize(conf.t->nodes.size()); |
|
16585
|
603
|
100
|
|
|
|
|
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
|
16586
|
536
|
|
|
|
|
|
heads[i] = conf.t->nodes[i].head; |
|
16587
|
1072
|
|
|
|
|
|
deprels[i] = conf.t->nodes[i].deprel; |
|
16588
|
|
|
|
|
|
|
} |
|
16589
|
67
|
|
|
|
|
|
} |
|
16590
|
|
|
|
|
|
|
|
|
16591
|
1
|
|
|
|
|
|
// Reads the complete model from `data` in this order: version byte (versioned
// models only), single-root flag (version 2 and newer), labels, transition
// system name, node extractor description, value extractor descriptions,
// embeddings and finally the network.
// Throws binary_decoder_error when the version is not supported or when the
// transition system or one of the extractors cannot be created.
// `cache` is forwarded to neural_network::generate_embeddings_cache.
void parser_nn::load(binary_decoder& data, unsigned cache) {
  string description, error;

  // Models without a version byte are treated as version 1.
  if (versioned)
    version = data.next_1B();
  else
    version = 1;
  if (version < 1 || version > VERSION_LATEST)
    throw binary_decoder_error("Unrecognized version of the parser_nn model");

  // The single-root flag is stored only since version 2.
  if (version >= 2)
    single_root = data.next_1B();
  else
    single_root = false;

  // Load labels
  labels.resize(data.next_2B());
  for (size_t i = 0; i < labels.size(); i++)
    data.next_str(labels[i]);

  // Load transition system
  string system_name;
  data.next_str(system_name);
  system.reset(transition_system::create(system_name, labels));
  if (!system)
    throw binary_decoder_error("Cannot load transition system");

  // Load node extractor
  data.next_str(description);
  const bool nodes_created = nodes.create(description, error);
  if (!nodes_created)
    throw binary_decoder_error(error.c_str());

  // Load value extractors and embeddings
  values.resize(data.next_2B());
  for (size_t i = 0; i < values.size(); i++) {
    data.next_str(description);
    const bool value_created = values[i].create(description, error);
    if (!value_created)
      throw binary_decoder_error(error.c_str());
  }

  // One embedding per value extractor.
  embeddings.resize(values.size());
  for (size_t i = 0; i < embeddings.size(); i++)
    embeddings[i].load(data);

  // Load the network
  network.load(data);
  network.generate_tanh_cache();
  network.generate_embeddings_cache(embeddings, embeddings_cache, cache);
}
|
16633
|
|
|
|
|
|
|
|
|
16634
|
|
|
|
|
|
|
} // namespace parsito |
|
16635
|
|
|
|
|
|
|
|
|
16636
|
|
|
|
|
|
|
///////// |
|
16637
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn_trainer.h |
|
16638
|
|
|
|
|
|
|
///////// |
|
16639
|
|
|
|
|
|
|
|
|
16640
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
16641
|
|
|
|
|
|
|
// |
|
16642
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16643
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16644
|
|
|
|
|
|
|
// |
|
16645
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16646
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16647
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16648
|
|
|
|
|
|
|
|
|
16649
|
|
|
|
|
|
|
namespace parsito { |
|
16650
|
|
|
|
|
|
|
|
|
16651
|
|
|
|
|
|
|
class parser_nn_trainer { |
|
16652
|
|
|
|
|
|
|
public: |
|
16653
|
|
|
|
|
|
|
static void train(const string& transition_system_name, const string& transition_oracle_name, bool single_root, |
|
16654
|
|
|
|
|
|
|
const string& embeddings_description, const string& nodes_description, const network_parameters& parameters, |
|
16655
|
|
|
|
|
|
|
unsigned number_of_threads, const vector& train, const vector& heldout, binary_encoder& enc); |
|
16656
|
|
|
|
|
|
|
}; |
|
16657
|
|
|
|
|
|
|
|
|
16658
|
|
|
|
|
|
|
} // namespace parsito |
|
16659
|
|
|
|
|
|
|
|
|
16660
|
|
|
|
|
|
|
///////// |
|
16661
|
|
|
|
|
|
|
// File: parsito/parser/parser_nn_trainer.cpp |
|
16662
|
|
|
|
|
|
|
///////// |
|
16663
|
|
|
|
|
|
|
|
|
16664
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
16665
|
|
|
|
|
|
|
// |
|
16666
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
16667
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
16668
|
|
|
|
|
|
|
// |
|
16669
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
16670
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
16671
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
16672
|
|
|
|
|
|
|
|
|
16673
|
|
|
|
|
|
|
namespace parsito { |
|
16674
|
|
|
|
|
|
|
|
|
16675
|
0
|
|
|
|
|
|
void parser_nn_trainer::train(const string& transition_system_name, const string& transition_oracle_name, bool single_root, |
|
16676
|
|
|
|
|
|
|
const string& embeddings_description, const string& nodes_description, const network_parameters& parameters, |
|
16677
|
|
|
|
|
|
|
unsigned /*number_of_threads*/, const vector& train, const vector& heldout, binary_encoder& enc) { |
|
16678
|
0
|
0
|
|
|
|
|
if (train.empty()) training_failure("No training data was given!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16679
|
|
|
|
|
|
|
|
|
16680
|
|
|
|
|
|
|
// Random generator with fixed seed for reproducibility |
|
16681
|
|
|
|
|
|
|
mt19937 generator(42); |
|
16682
|
|
|
|
|
|
|
|
|
16683
|
|
|
|
|
|
|
// Check that all non-root nodes have heads and nonempty deprel |
|
16684
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
|
16685
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
|
16686
|
0
|
0
|
|
|
|
|
if (node.id) { |
|
16687
|
0
|
0
|
|
|
|
|
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16688
|
0
|
0
|
|
|
|
|
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16689
|
|
|
|
|
|
|
} |
|
16690
|
|
|
|
|
|
|
|
|
16691
|
|
|
|
|
|
|
// Create parser instance to be trained |
|
16692
|
0
|
|
|
|
|
|
parser_nn parser(true); parser.version = parser_nn::VERSION_LATEST; |
|
16693
|
|
|
|
|
|
|
|
|
16694
|
|
|
|
|
|
|
// Generate labels for transition system |
|
16695
|
|
|
|
|
|
|
unordered_set labels_set; |
|
16696
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
|
16697
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
|
16698
|
0
|
0
|
|
|
|
|
if (node.id && !labels_set.count(node.deprel)) { |
|
16699
|
0
|
|
|
|
|
|
labels_set.insert(node.deprel); |
|
16700
|
0
|
0
|
|
|
|
|
parser.labels.push_back(node.deprel); |
|
16701
|
|
|
|
|
|
|
} |
|
16702
|
|
|
|
|
|
|
|
|
16703
|
|
|
|
|
|
|
// If single_root, check that exactly root nodes have "root" deprel |
|
16704
|
0
|
0
|
|
|
|
|
if (single_root) { |
|
16705
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) { |
|
16706
|
|
|
|
|
|
|
unsigned roots = 0; |
|
16707
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
|
16708
|
0
|
0
|
|
|
|
|
if (node.id) { |
|
16709
|
0
|
0
|
|
|
|
|
if (node.head == 0 && node.deprel != "root") |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16710
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, every root node must have 'root' deprel!"); |
|
|
|
0
|
|
|
|
|
|
|
16711
|
0
|
0
|
|
|
|
|
if (node.head != 0 && node.deprel == "root") |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16712
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, any non-root cannot have 'root' deprel!"); |
|
|
|
0
|
|
|
|
|
|
|
16713
|
0
|
|
|
|
|
|
roots += node.head == 0; |
|
16714
|
|
|
|
|
|
|
} |
|
16715
|
0
|
0
|
|
|
|
|
if (roots != 1) |
|
16716
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, every training tree must have single root!"); |
|
|
|
0
|
|
|
|
|
|
|
16717
|
|
|
|
|
|
|
} |
|
16718
|
|
|
|
|
|
|
|
|
16719
|
|
|
|
|
|
|
// Make sure (in case input is really small) there is "root" deprel plus another one |
|
16720
|
0
|
0
|
|
|
|
|
if (!labels_set.count("root")) |
|
|
|
0
|
|
|
|
|
|
|
16721
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, the deprel 'root' must be present!"); |
|
|
|
0
|
|
|
|
|
|
|
16722
|
0
|
0
|
|
|
|
|
if (labels_set.size() <= 1) |
|
16723
|
0
|
0
|
|
|
|
|
training_failure("When single root is required, deprel different from 'root' must exist!"); |
|
|
|
0
|
|
|
|
|
|
|
16724
|
|
|
|
|
|
|
} |
|
16725
|
|
|
|
|
|
|
|
|
16726
|
|
|
|
|
|
|
// Create transition system and transition oracle |
|
16727
|
0
|
0
|
|
|
|
|
parser.system.reset(transition_system::create(transition_system_name, parser.labels)); |
|
16728
|
0
|
0
|
|
|
|
|
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16729
|
|
|
|
|
|
|
|
|
16730
|
0
|
0
|
|
|
|
|
unique_ptr oracle(parser.system->oracle(transition_oracle_name)); |
|
16731
|
0
|
0
|
|
|
|
|
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16732
|
|
|
|
|
|
|
|
|
16733
|
|
|
|
|
|
|
// Create node_extractor |
|
16734
|
|
|
|
|
|
|
string error; |
|
16735
|
0
|
0
|
|
|
|
|
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16736
|
|
|
|
|
|
|
|
|
16737
|
|
|
|
|
|
|
// Load value_extractors and embeddings |
|
16738
|
0
|
|
|
|
|
|
vector value_names; |
|
16739
|
|
|
|
|
|
|
vector lines, tokens; |
|
16740
|
0
|
0
|
|
|
|
|
split(embeddings_description, '\n', lines); |
|
16741
|
0
|
0
|
|
|
|
|
for (auto&& line : lines) { |
|
16742
|
|
|
|
|
|
|
// Ignore empty lines and comments |
|
16743
|
0
|
0
|
|
|
|
|
if (!line.len || line.str[0] == '#') continue; |
|
|
|
0
|
|
|
|
|
|
|
16744
|
|
|
|
|
|
|
|
|
16745
|
0
|
0
|
|
|
|
|
split(line, ' ', tokens); |
|
16746
|
0
|
0
|
|
|
|
|
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16747
|
0
|
0
|
|
|
|
|
training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
16748
|
|
|
|
|
|
|
|
|
16749
|
0
|
0
|
|
|
|
|
value_names.emplace_back(string(tokens[0].str, tokens[0].len)); |
|
16750
|
0
|
0
|
|
|
|
|
parser.values.emplace_back(); |
|
16751
|
0
|
0
|
|
|
|
|
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16752
|
|
|
|
|
|
|
|
|
16753
|
0
|
0
|
|
|
|
|
int dimension = parse_int(tokens[1], "embedding dimension"); |
|
16754
|
0
|
0
|
|
|
|
|
int min_count = parse_int(tokens[2], "minimum frequency count"); |
|
16755
|
|
|
|
|
|
|
unsigned updatable_index = 0; |
|
16756
|
|
|
|
|
|
|
unsigned embeddings_from_file = 0; |
|
16757
|
|
|
|
|
|
|
string embeddings_from_file_comment; |
|
16758
|
0
|
|
|
|
|
|
vector>> weights; |
|
16759
|
|
|
|
|
|
|
unordered_set weights_set; |
|
16760
|
|
|
|
|
|
|
|
|
16761
|
|
|
|
|
|
|
// Compute words and counts present in the training data |
|
16762
|
|
|
|
|
|
|
string word; |
|
16763
|
|
|
|
|
|
|
unordered_map word_counts; |
|
16764
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
|
16765
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
|
16766
|
0
|
0
|
|
|
|
|
if (node.id) { |
|
16767
|
0
|
0
|
|
|
|
|
parser.values.back().extract(node, word); |
|
16768
|
0
|
|
|
|
|
|
word_counts[word]++; |
|
16769
|
|
|
|
|
|
|
} |
|
16770
|
|
|
|
|
|
|
|
|
16771
|
|
|
|
|
|
|
// Load embedding if it was given |
|
16772
|
0
|
0
|
|
|
|
|
if (tokens.size() >= 4) { |
|
16773
|
0
|
0
|
|
|
|
|
int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1; |
|
|
|
0
|
|
|
|
|
|
|
16774
|
0
|
0
|
|
|
|
|
int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max(); |
|
|
|
0
|
|
|
|
|
|
|
16775
|
0
|
0
|
|
|
|
|
ifstream in(path_from_utf8(string(tokens[3].str, tokens[3].len)).c_str()); |
|
16776
|
0
|
0
|
|
|
|
|
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16777
|
|
|
|
|
|
|
|
|
16778
|
|
|
|
|
|
|
// Load first line containing dictionary size and dimensions |
|
16779
|
|
|
|
|
|
|
string line; |
|
16780
|
|
|
|
|
|
|
vector parts; |
|
16781
|
0
|
0
|
|
|
|
|
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16782
|
0
|
0
|
|
|
|
|
split(line, ' ', parts); |
|
16783
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16784
|
0
|
0
|
|
|
|
|
int file_dimension = parse_int(parts[1], "embedding file dimension"); |
|
16785
|
|
|
|
|
|
|
|
|
16786
|
0
|
0
|
|
|
|
|
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16787
|
|
|
|
|
|
|
|
|
16788
|
|
|
|
|
|
|
// Generate random projection when smaller dimension is required |
|
16789
|
0
|
|
|
|
|
|
vector> projection; |
|
16790
|
0
|
0
|
|
|
|
|
if (file_dimension > dimension) { |
|
16791
|
0
|
0
|
|
|
|
|
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16792
|
|
|
|
|
|
|
|
|
16793
|
|
|
|
|
|
|
uniform_real_distribution uniform(0, 1); |
|
16794
|
0
|
0
|
|
|
|
|
projection.resize(dimension); |
|
16795
|
0
|
0
|
|
|
|
|
for (auto&& row : projection) { |
|
16796
|
0
|
0
|
|
|
|
|
row.resize(file_dimension); |
|
16797
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) weight = uniform(generator); |
|
16798
|
|
|
|
|
|
|
|
|
16799
|
|
|
|
|
|
|
double sum = 0; |
|
16800
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) sum += weight; |
|
16801
|
0
|
0
|
|
|
|
|
for (auto&& weight : row) weight /= sum; |
|
16802
|
|
|
|
|
|
|
} |
|
16803
|
|
|
|
|
|
|
} |
|
16804
|
|
|
|
|
|
|
|
|
16805
|
|
|
|
|
|
|
// Load input embedding |
|
16806
|
0
|
0
|
|
|
|
|
vector input_weights(file_dimension); |
|
16807
|
0
|
0
|
|
|
|
|
vector projected_weights(dimension); |
|
16808
|
0
|
0
|
|
|
|
|
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16809
|
0
|
0
|
|
|
|
|
split(line, ' ', parts); |
|
16810
|
0
|
0
|
|
|
|
|
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16811
|
0
|
0
|
|
|
|
|
if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]); |
|
|
|
0
|
|
|
|
|
|
|
16812
|
0
|
0
|
|
|
|
|
for (int i = 0; i < file_dimension; i++) |
|
16813
|
0
|
0
|
|
|
|
|
input_weights[i] = parse_double(parts[1 + i], "embedding weight"); |
|
16814
|
|
|
|
|
|
|
|
|
16815
|
0
|
|
|
|
|
|
string word(parts[0].str, parts[0].len); |
|
16816
|
|
|
|
|
|
|
|
|
16817
|
|
|
|
|
|
|
// For update_weights == 2, ignore embeddings for unknown words |
|
16818
|
0
|
0
|
|
|
|
|
if (update_weights == 2 && !word_counts.count(word)) |
|
16819
|
|
|
|
|
|
|
continue; |
|
16820
|
|
|
|
|
|
|
|
|
16821
|
0
|
0
|
|
|
|
|
for (int i = 0; i < dimension; i++) |
|
16822
|
0
|
0
|
|
|
|
|
if (file_dimension == dimension) { |
|
16823
|
0
|
|
|
|
|
|
projected_weights[i] = input_weights[i]; |
|
16824
|
|
|
|
|
|
|
} else { |
|
16825
|
0
|
|
|
|
|
|
projected_weights[i] = 0; |
|
16826
|
0
|
0
|
|
|
|
|
for (int j = 0; j < file_dimension; j++) |
|
16827
|
0
|
|
|
|
|
|
projected_weights[i] += projection[i][j] * input_weights[j]; |
|
16828
|
|
|
|
|
|
|
} |
|
16829
|
|
|
|
|
|
|
|
|
16830
|
0
|
0
|
|
|
|
|
if (!weights_set.count(word)) { |
|
16831
|
0
|
0
|
|
|
|
|
weights.emplace_back(word, projected_weights); |
|
16832
|
|
|
|
|
|
|
weights_set.insert(word); |
|
16833
|
|
|
|
|
|
|
} |
|
16834
|
|
|
|
|
|
|
} |
|
16835
|
0
|
|
|
|
|
|
embeddings_from_file = weights.size(); |
|
16836
|
0
|
0
|
|
|
|
|
updatable_index = update_weights ? 0 : embeddings_from_file; |
|
16837
|
|
|
|
|
|
|
} |
|
16838
|
|
|
|
|
|
|
|
|
16839
|
|
|
|
|
|
|
// Add embedding for non-present word with min_count, sorted by count |
|
16840
|
|
|
|
|
|
|
{ |
|
16841
|
0
|
|
|
|
|
|
vector> count_words; |
|
16842
|
0
|
0
|
|
|
|
|
for (auto&& word_count : word_counts) |
|
16843
|
0
|
0
|
|
|
|
|
if (word_count.second >= min_count && !weights_set.count(word_count.first)) |
|
16844
|
0
|
0
|
|
|
|
|
count_words.emplace_back(word_count.second, word_count.first); |
|
16845
|
|
|
|
|
|
|
|
|
16846
|
|
|
|
|
|
|
sort(count_words.rbegin(), count_words.rend()); |
|
16847
|
|
|
|
|
|
|
|
|
16848
|
0
|
0
|
|
|
|
|
vector word_weights(dimension); |
|
16849
|
|
|
|
|
|
|
uniform_real_distribution uniform(-1, 1); |
|
16850
|
0
|
0
|
|
|
|
|
for (auto&& count_word : count_words) { |
|
16851
|
0
|
0
|
|
|
|
|
for (auto&& word_weight : word_weights) |
|
16852
|
0
|
|
|
|
|
|
word_weight = uniform(generator); |
|
16853
|
|
|
|
|
|
|
|
|
16854
|
0
|
0
|
|
|
|
|
weights.emplace_back(count_word.second, word_weights); |
|
16855
|
|
|
|
|
|
|
} |
|
16856
|
|
|
|
|
|
|
} |
|
16857
|
|
|
|
|
|
|
|
|
16858
|
|
|
|
|
|
|
// If there are unknown words in the training data, create initial embedding |
|
16859
|
0
|
0
|
|
|
|
|
vector unknown_weights(dimension); |
|
16860
|
0
|
0
|
|
|
|
|
if (min_count > 1) { |
|
16861
|
|
|
|
|
|
|
uniform_real_distribution uniform(-1, 1); |
|
16862
|
|
|
|
|
|
|
|
|
16863
|
0
|
0
|
|
|
|
|
for (auto&& weight : unknown_weights) |
|
16864
|
0
|
|
|
|
|
|
weight = uniform(generator); |
|
16865
|
|
|
|
|
|
|
} |
|
16866
|
|
|
|
|
|
|
|
|
16867
|
|
|
|
|
|
|
// Add the embedding |
|
16868
|
0
|
0
|
|
|
|
|
parser.embeddings.emplace_back(); |
|
16869
|
0
|
0
|
|
|
|
|
parser.embeddings.back().create(dimension, updatable_index, weights, unknown_weights); |
|
16870
|
|
|
|
|
|
|
|
|
16871
|
|
|
|
|
|
|
// Count the cover of this embedding |
|
16872
|
|
|
|
|
|
|
string buffer; |
|
16873
|
|
|
|
|
|
|
unsigned words_total = 0, words_covered = 0, words_covered_from_file = 0; |
|
16874
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) |
|
16875
|
0
|
0
|
|
|
|
|
for (auto&& node : tree.nodes) |
|
16876
|
0
|
0
|
|
|
|
|
if (node.id) { |
|
16877
|
0
|
0
|
|
|
|
|
parser.values.back().extract(node, word); |
|
16878
|
0
|
|
|
|
|
|
words_total++; |
|
16879
|
0
|
0
|
|
|
|
|
int word_id = parser.embeddings.back().lookup_word(word, buffer); |
|
16880
|
0
|
|
|
|
|
|
words_covered += word_id != parser.embeddings.back().unknown_word(); |
|
16881
|
0
|
0
|
|
|
|
|
words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file; |
|
|
|
0
|
|
|
|
|
|
|
16882
|
|
|
|
|
|
|
} |
|
16883
|
|
|
|
|
|
|
|
|
16884
|
|
|
|
|
|
|
cerr << "Initialized '" << tokens[0] << "' embedding with " << embeddings_from_file << embeddings_from_file_comment |
|
16885
|
0
|
|
|
|
|
|
<< "," << weights.size() << " words and " << fixed << setprecision(1) << 100. * words_covered_from_file / words_total |
|
16886
|
0
|
|
|
|
|
|
<< "%," << 100. * words_covered / words_total << "% coverage." << endl; |
|
16887
|
|
|
|
|
|
|
} |
|
16888
|
|
|
|
|
|
|
|
|
16889
|
|
|
|
|
|
|
// Train the network |
|
16890
|
|
|
|
|
|
|
unsigned total_dimension = 0, total_nodes = 0; |
|
16891
|
0
|
0
|
|
|
|
|
for (auto&& embedding : parser.embeddings) total_dimension += embedding.dimension; |
|
16892
|
0
|
0
|
|
|
|
|
for (auto&& tree : train) total_nodes += tree.nodes.size() - 1; |
|
16893
|
0
|
|
|
|
|
|
auto scaled_parameters = parameters; |
|
16894
|
0
|
|
|
|
|
|
scaled_parameters.l1_regularization /= train.size(); |
|
16895
|
0
|
|
|
|
|
|
scaled_parameters.l2_regularization /= total_nodes; |
|
16896
|
0
|
0
|
|
|
|
|
neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator); |
|
|
|
0
|
|
|
|
|
|
|
16897
|
|
|
|
|
|
|
|
|
16898
|
0
|
|
|
|
|
|
neural_network heldout_best_network; |
|
16899
|
|
|
|
|
|
|
unsigned heldout_best_correct_labelled = 0, heldout_best_iteration = 0; |
|
16900
|
|
|
|
|
|
|
|
|
16901
|
|
|
|
|
|
|
vector permutation; |
|
16902
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < train.size(); i++) |
|
16903
|
0
|
|
|
|
|
|
permutation.push_back(permutation.size()); |
|
16904
|
|
|
|
|
|
|
|
|
16905
|
0
|
0
|
|
|
|
|
for (int iteration = 1; network_trainer.next_iteration(); iteration++) { |
|
16906
|
|
|
|
|
|
|
// Train on training data |
|
16907
|
0
|
|
|
|
|
|
shuffle(permutation.begin(), permutation.end(), generator); |
|
16908
|
|
|
|
|
|
|
|
|
16909
|
0
|
|
|
|
|
|
atomic atomic_index(0); |
|
16910
|
0
|
|
|
|
|
|
atomic atomic_logprob(0); |
|
16911
|
0
|
|
|
|
|
|
auto training = [&]() { |
|
16912
|
0
|
|
|
|
|
|
tree t; |
|
16913
|
0
|
|
|
|
|
|
configuration conf(single_root); |
|
16914
|
|
|
|
|
|
|
string word, word_buffer; |
|
16915
|
0
|
|
|
|
|
|
vector> nodes_embeddings; |
|
16916
|
|
|
|
|
|
|
vector extracted_nodes; |
|
16917
|
|
|
|
|
|
|
vector*> extracted_embeddings; |
|
16918
|
0
|
|
|
|
|
|
neural_network_trainer::workspace workspace; |
|
16919
|
|
|
|
|
|
|
double logprob = 0; |
|
16920
|
|
|
|
|
|
|
|
|
16921
|
|
|
|
|
|
|
// Data for structured prediction |
|
16922
|
0
|
0
|
|
|
|
|
tree t_eval; |
|
16923
|
0
|
|
|
|
|
|
configuration conf_eval(single_root); |
|
16924
|
0
|
|
|
|
|
|
vector> nodes_embeddings_eval; |
|
16925
|
|
|
|
|
|
|
vector extracted_nodes_eval; |
|
16926
|
|
|
|
|
|
|
vector*> extracted_embeddings_eval; |
|
16927
|
|
|
|
|
|
|
vector transitions_eval; |
|
16928
|
|
|
|
|
|
|
vector hidden_layer_eval, outcomes_eval; |
|
16929
|
|
|
|
|
|
|
|
|
16930
|
0
|
0
|
|
|
|
|
for (unsigned current_index; (current_index = atomic_index++) < permutation.size();) { |
|
16931
|
0
|
|
|
|
|
|
const tree& gold = train[permutation[current_index]]; |
|
16932
|
|
|
|
|
|
|
t = gold; |
|
16933
|
|
|
|
|
|
|
t.unlink_all_nodes(); |
|
16934
|
0
|
0
|
|
|
|
|
conf.init(&t); |
|
16935
|
|
|
|
|
|
|
|
|
16936
|
|
|
|
|
|
|
// Compute embeddings |
|
16937
|
0
|
0
|
|
|
|
|
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
|
|
0
|
|
|
|
|
|
|
16938
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < t.nodes.size(); i++) { |
|
16939
|
0
|
0
|
|
|
|
|
nodes_embeddings[i].resize(parser.embeddings.size()); |
|
16940
|
0
|
0
|
|
|
|
|
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
|
16941
|
0
|
0
|
|
|
|
|
parser.values[j].extract(t.nodes[i], word); |
|
16942
|
0
|
0
|
|
|
|
|
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
|
16943
|
|
|
|
|
|
|
} |
|
16944
|
|
|
|
|
|
|
} |
|
16945
|
|
|
|
|
|
|
|
|
16946
|
|
|
|
|
|
|
// Create tree oracle |
|
16947
|
0
|
0
|
|
|
|
|
auto tree_oracle = oracle->create_tree_oracle(gold); |
|
16948
|
|
|
|
|
|
|
|
|
16949
|
|
|
|
|
|
|
// Train the network |
|
16950
|
0
|
0
|
|
|
|
|
while (!conf.final()) { |
|
16951
|
|
|
|
|
|
|
// Extract nodes |
|
16952
|
0
|
0
|
|
|
|
|
parser.nodes.extract(conf, extracted_nodes); |
|
16953
|
0
|
0
|
|
|
|
|
extracted_embeddings.resize(extracted_nodes.size()); |
|
16954
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < extracted_nodes.size(); i++) |
|
16955
|
0
|
0
|
|
|
|
|
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
|
16956
|
|
|
|
|
|
|
|
|
16957
|
|
|
|
|
|
|
// Propagate |
|
16958
|
0
|
0
|
|
|
|
|
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
|
16959
|
|
|
|
|
|
|
|
|
16960
|
|
|
|
|
|
|
// Find most probable applicable transition |
|
16961
|
|
|
|
|
|
|
int network_best = -1; |
|
16962
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < workspace.outcomes.size(); i++) |
|
16963
|
0
|
0
|
|
|
|
|
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
16964
|
0
|
|
|
|
|
|
network_best = i; |
|
16965
|
|
|
|
|
|
|
|
|
16966
|
|
|
|
|
|
|
// Apply the oracle |
|
16967
|
0
|
0
|
|
|
|
|
auto prediction = tree_oracle->predict(conf, network_best, iteration); |
|
16968
|
|
|
|
|
|
|
|
|
16969
|
|
|
|
|
|
|
// If the best transition is applicable, train on it |
|
16970
|
0
|
0
|
|
|
|
|
if (parser.system->applicable(conf, prediction.best)) { |
|
|
|
0
|
|
|
|
|
|
|
16971
|
|
|
|
|
|
|
// Update logprob |
|
16972
|
0
|
0
|
|
|
|
|
if (workspace.outcomes[prediction.best]) |
|
16973
|
0
|
|
|
|
|
|
logprob += log(workspace.outcomes[prediction.best]); |
|
16974
|
|
|
|
|
|
|
|
|
16975
|
|
|
|
|
|
|
// Backpropagate the chosen outcome |
|
16976
|
0
|
0
|
|
|
|
|
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, prediction.best, workspace); |
|
16977
|
|
|
|
|
|
|
} |
|
16978
|
|
|
|
|
|
|
|
|
16979
|
|
|
|
|
|
|
// Emergency break if the to_follow transition is not applicable |
|
16980
|
0
|
0
|
|
|
|
|
if (!parser.system->applicable(conf, prediction.to_follow)) |
|
|
|
0
|
|
|
|
|
|
|
16981
|
|
|
|
|
|
|
break; |
|
16982
|
|
|
|
|
|
|
|
|
16983
|
|
|
|
|
|
|
// Follow the chosen outcome |
|
16984
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf, prediction.to_follow); |
|
16985
|
|
|
|
|
|
|
|
|
16986
|
|
|
|
|
|
|
// If a node was linked, recompute its embeddings as deprel has changed |
|
16987
|
0
|
0
|
|
|
|
|
if (child >= 0) |
|
16988
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
16989
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t.nodes[child], word); |
|
16990
|
0
|
0
|
|
|
|
|
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
16991
|
|
|
|
|
|
|
} |
|
16992
|
|
|
|
|
|
|
} |
|
16993
|
|
|
|
|
|
|
network_trainer.finalize_sentence(); |
|
16994
|
|
|
|
|
|
|
|
|
16995
|
|
|
|
|
|
|
// Structured prediction |
|
16996
|
0
|
0
|
|
|
|
|
if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) { |
|
|
|
0
|
|
|
|
|
|
|
16997
|
0
|
|
|
|
|
|
uniform_int_distribution train_distribution(0, train.size() - 1); |
|
16998
|
0
|
|
|
|
|
|
const tree& gold = train[train_distribution(generator)]; |
|
16999
|
|
|
|
|
|
|
t = gold; |
|
17000
|
|
|
|
|
|
|
t.unlink_all_nodes(); |
|
17001
|
0
|
0
|
|
|
|
|
conf.init(&t); |
|
17002
|
|
|
|
|
|
|
|
|
17003
|
|
|
|
|
|
|
// Compute embeddings |
|
17004
|
0
|
0
|
|
|
|
|
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
|
|
0
|
|
|
|
|
|
|
17005
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < t.nodes.size(); i++) { |
|
17006
|
0
|
0
|
|
|
|
|
nodes_embeddings[i].resize(parser.embeddings.size()); |
|
17007
|
0
|
0
|
|
|
|
|
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
|
17008
|
0
|
0
|
|
|
|
|
parser.values[j].extract(t.nodes[i], word); |
|
17009
|
0
|
0
|
|
|
|
|
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
|
17010
|
|
|
|
|
|
|
} |
|
17011
|
|
|
|
|
|
|
} |
|
17012
|
|
|
|
|
|
|
|
|
17013
|
|
|
|
|
|
|
// Create tree oracle |
|
17014
|
0
|
0
|
|
|
|
|
auto tree_oracle = oracle->create_tree_oracle(gold); |
|
17015
|
|
|
|
|
|
|
|
|
17016
|
|
|
|
|
|
|
// Train the network |
|
17017
|
0
|
0
|
|
|
|
|
while (!conf.final()) { |
|
17018
|
|
|
|
|
|
|
// Extract nodes |
|
17019
|
0
|
0
|
|
|
|
|
parser.nodes.extract(conf, extracted_nodes); |
|
17020
|
0
|
0
|
|
|
|
|
extracted_embeddings.resize(extracted_nodes.size()); |
|
17021
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < extracted_nodes.size(); i++) |
|
17022
|
0
|
0
|
|
|
|
|
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
|
17023
|
|
|
|
|
|
|
|
|
17024
|
|
|
|
|
|
|
// Find the best transition |
|
17025
|
|
|
|
|
|
|
int best = 0; |
|
17026
|
|
|
|
|
|
|
int best_uas = -1; |
|
17027
|
0
|
0
|
|
|
|
|
tree_oracle->interesting_transitions(conf, transitions_eval); |
|
17028
|
0
|
0
|
|
|
|
|
for (auto&& transition : transitions_eval) { |
|
17029
|
|
|
|
|
|
|
t_eval = t; |
|
17030
|
0
|
0
|
|
|
|
|
conf_eval = conf; |
|
17031
|
0
|
|
|
|
|
|
conf_eval.t = &t_eval; |
|
17032
|
0
|
0
|
|
|
|
|
nodes_embeddings_eval = nodes_embeddings; |
|
17033
|
|
|
|
|
|
|
|
|
17034
|
|
|
|
|
|
|
// Perform probed transition |
|
17035
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf_eval, transition); |
|
17036
|
0
|
0
|
|
|
|
|
if (child >= 0) |
|
17037
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
17038
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t_eval.nodes[child], word); |
|
17039
|
0
|
0
|
|
|
|
|
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
17040
|
|
|
|
|
|
|
} |
|
17041
|
|
|
|
|
|
|
|
|
17042
|
|
|
|
|
|
|
// Train the network |
|
17043
|
0
|
0
|
|
|
|
|
while (!conf_eval.final()) { |
|
17044
|
|
|
|
|
|
|
// Extract nodes |
|
17045
|
0
|
0
|
|
|
|
|
parser.nodes.extract(conf_eval, extracted_nodes_eval); |
|
17046
|
0
|
0
|
|
|
|
|
extracted_embeddings_eval.resize(extracted_nodes_eval.size()); |
|
17047
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < extracted_nodes_eval.size(); i++) |
|
17048
|
0
|
0
|
|
|
|
|
extracted_embeddings_eval[i] = extracted_nodes_eval[i] >= 0 ? &nodes_embeddings_eval[extracted_nodes_eval[i]] : nullptr; |
|
17049
|
|
|
|
|
|
|
|
|
17050
|
|
|
|
|
|
|
// Classify using neural network |
|
17051
|
0
|
0
|
|
|
|
|
parser.network.propagate(parser.embeddings, extracted_embeddings_eval, hidden_layer_eval, outcomes_eval, nullptr, false); |
|
17052
|
|
|
|
|
|
|
|
|
17053
|
|
|
|
|
|
|
// Find most probable applicable transition |
|
17054
|
|
|
|
|
|
|
int network_best = -1; |
|
17055
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < outcomes_eval.size(); i++) |
|
17056
|
0
|
0
|
|
|
|
|
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17057
|
0
|
|
|
|
|
|
network_best = i; |
|
17058
|
|
|
|
|
|
|
|
|
17059
|
|
|
|
|
|
|
// Perform the best transition |
|
17060
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf_eval, network_best); |
|
17061
|
|
|
|
|
|
|
|
|
17062
|
|
|
|
|
|
|
// If a node was linked, recompute its embeddings as deprel has changed |
|
17063
|
0
|
0
|
|
|
|
|
if (child >= 0) |
|
17064
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
17065
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t_eval.nodes[child], word); |
|
17066
|
0
|
0
|
|
|
|
|
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
17067
|
|
|
|
|
|
|
} |
|
17068
|
|
|
|
|
|
|
} |
|
17069
|
|
|
|
|
|
|
|
|
17070
|
|
|
|
|
|
|
int uas = 0; |
|
17071
|
0
|
0
|
|
|
|
|
for (unsigned i = 1; i < gold.nodes.size(); i++) |
|
17072
|
0
|
|
|
|
|
|
uas += gold.nodes[i].head == t_eval.nodes[i].head; |
|
17073
|
|
|
|
|
|
|
|
|
17074
|
0
|
0
|
|
|
|
|
if (uas > best_uas) best = transition, best_uas = uas; |
|
17075
|
|
|
|
|
|
|
} |
|
17076
|
|
|
|
|
|
|
|
|
17077
|
|
|
|
|
|
|
// Propagate |
|
17078
|
0
|
0
|
|
|
|
|
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
|
17079
|
|
|
|
|
|
|
|
|
17080
|
|
|
|
|
|
|
// Backpropagate for the best transition |
|
17081
|
0
|
0
|
|
|
|
|
if (workspace.outcomes[best]) |
|
17082
|
0
|
|
|
|
|
|
logprob += log(workspace.outcomes[best]); |
|
17083
|
0
|
0
|
|
|
|
|
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, best, workspace); |
|
17084
|
|
|
|
|
|
|
|
|
17085
|
|
|
|
|
|
|
// // Find most probable applicable transition when following network outcome |
|
17086
|
|
|
|
|
|
|
// int network_best = -1; |
|
17087
|
|
|
|
|
|
|
// for (unsigned i = 0; i < workspace.outcomes.size(); i++) |
|
17088
|
|
|
|
|
|
|
// if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
17089
|
|
|
|
|
|
|
// network_best = i; |
|
17090
|
|
|
|
|
|
|
|
|
17091
|
|
|
|
|
|
|
// Follow the best outcome |
|
17092
|
0
|
0
|
|
|
|
|
int child = parser.system->perform(conf, /*network_*/best); |
|
17093
|
|
|
|
|
|
|
|
|
17094
|
|
|
|
|
|
|
// If a node was linked, recompute its embeddings as deprel has changed |
|
17095
|
0
|
0
|
|
|
|
|
if (child >= 0) |
|
17096
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
17097
|
0
|
0
|
|
|
|
|
parser.values[i].extract(t.nodes[child], word); |
|
17098
|
0
|
0
|
|
|
|
|
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
17099
|
|
|
|
|
|
|
} |
|
17100
|
|
|
|
|
|
|
} |
|
17101
|
|
|
|
|
|
|
network_trainer.finalize_sentence(); |
|
17102
|
|
|
|
|
|
|
} |
|
17103
|
|
|
|
|
|
|
} |
|
17104
|
0
|
0
|
|
|
|
|
for (double old_atomic_logprob = atomic_logprob; atomic_logprob.compare_exchange_weak(old_atomic_logprob, old_atomic_logprob + logprob); ) {} |
|
17105
|
0
|
|
|
|
|
|
}; |
|
17106
|
|
|
|
|
|
|
|
|
17107
|
0
|
0
|
|
|
|
|
cerr << "Iteration " << iteration << ": "; |
|
|
|
0
|
|
|
|
|
|
|
17108
|
0
|
0
|
|
|
|
|
training(); |
|
17109
|
|
|
|
|
|
|
cerr << "training logprob " << scientific << setprecision(4) << atomic_logprob; |
|
17110
|
|
|
|
|
|
|
|
|
17111
|
|
|
|
|
|
|
// Evaluate heldout data if present |
|
17112
|
0
|
0
|
|
|
|
|
if (!heldout.empty()) { |
|
17113
|
0
|
0
|
|
|
|
|
tree t; |
|
17114
|
|
|
|
|
|
|
unsigned total = 0, correct_unlabelled = 0, correct_labelled = 0; |
|
17115
|
0
|
0
|
|
|
|
|
for (auto&& gold : heldout) { |
|
17116
|
|
|
|
|
|
|
t = gold; |
|
17117
|
|
|
|
|
|
|
t.unlink_all_nodes(); |
|
17118
|
|
|
|
|
|
|
parser.parse(t); |
|
17119
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < t.nodes.size(); i++) { |
|
17120
|
0
|
|
|
|
|
|
total++; |
|
17121
|
0
|
|
|
|
|
|
correct_unlabelled += t.nodes[i].head == gold.nodes[i].head; |
|
17122
|
0
|
0
|
|
|
|
|
correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel; |
|
|
|
0
|
|
|
|
|
|
|
17123
|
|
|
|
|
|
|
} |
|
17124
|
|
|
|
|
|
|
} |
|
17125
|
|
|
|
|
|
|
|
|
17126
|
0
|
0
|
|
|
|
|
cerr << ", heldout UAS " << fixed << setprecision(2) << (100. * correct_unlabelled / total) << "%, LAS " << (100. * correct_labelled / total) << "%"; |
|
17127
|
|
|
|
|
|
|
|
|
17128
|
0
|
0
|
|
|
|
|
if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) { |
|
|
|
0
|
|
|
|
|
|
|
17129
|
0
|
0
|
|
|
|
|
heldout_best_network = parser.network; |
|
17130
|
|
|
|
|
|
|
heldout_best_correct_labelled = correct_labelled; |
|
17131
|
0
|
|
|
|
|
|
heldout_best_iteration = iteration; |
|
17132
|
|
|
|
|
|
|
} |
|
17133
|
|
|
|
|
|
|
} |
|
17134
|
|
|
|
|
|
|
|
|
17135
|
|
|
|
|
|
|
cerr << endl; |
|
17136
|
|
|
|
|
|
|
} |
|
17137
|
|
|
|
|
|
|
|
|
17138
|
0
|
0
|
|
|
|
|
if (parameters.early_stopping && heldout_best_iteration > 0) { |
|
|
|
0
|
|
|
|
|
|
|
17139
|
|
|
|
|
|
|
cerr << "Using early stopping -- choosing network from iteration " << heldout_best_iteration << endl; |
|
17140
|
0
|
0
|
|
|
|
|
parser.network = heldout_best_network; |
|
17141
|
|
|
|
|
|
|
} |
|
17142
|
|
|
|
|
|
|
|
|
17143
|
|
|
|
|
|
|
// Encode version |
|
17144
|
0
|
0
|
|
|
|
|
enc.add_1B(parser.version); |
|
17145
|
|
|
|
|
|
|
|
|
17146
|
|
|
|
|
|
|
// Encode single_root |
|
17147
|
0
|
0
|
|
|
|
|
enc.add_1B(single_root); |
|
17148
|
|
|
|
|
|
|
|
|
17149
|
|
|
|
|
|
|
// Encode transition system |
|
17150
|
0
|
0
|
|
|
|
|
enc.add_2B(parser.labels.size()); |
|
17151
|
0
|
0
|
|
|
|
|
for (auto&& label : parser.labels) |
|
17152
|
0
|
0
|
|
|
|
|
enc.add_str(label); |
|
17153
|
0
|
0
|
|
|
|
|
enc.add_str(transition_system_name); |
|
17154
|
|
|
|
|
|
|
|
|
17155
|
|
|
|
|
|
|
// Encode nodes selector |
|
17156
|
0
|
0
|
|
|
|
|
enc.add_str(nodes_description); |
|
17157
|
|
|
|
|
|
|
|
|
17158
|
|
|
|
|
|
|
// Encode value extractors and embeddings |
|
17159
|
0
|
0
|
|
|
|
|
enc.add_2B(value_names.size()); |
|
17160
|
0
|
0
|
|
|
|
|
for (auto&& value_name : value_names) |
|
17161
|
0
|
0
|
|
|
|
|
enc.add_str(value_name); |
|
17162
|
0
|
0
|
|
|
|
|
for (auto&& embedding : parser.embeddings) |
|
17163
|
0
|
0
|
|
|
|
|
embedding.save(enc); |
|
17164
|
|
|
|
|
|
|
|
|
17165
|
|
|
|
|
|
|
// Encode the network |
|
17166
|
0
|
0
|
|
|
|
|
network_trainer.save_network(enc); |
|
17167
|
0
|
|
|
|
|
|
} |
|
17168
|
|
|
|
|
|
|
|
|
17169
|
|
|
|
|
|
|
} // namespace parsito |
|
17170
|
|
|
|
|
|
|
|
|
17171
|
|
|
|
|
|
|
///////// |
|
17172
|
|
|
|
|
|
|
// File: parsito/transition/transition.cpp |
|
17173
|
|
|
|
|
|
|
///////// |
|
17174
|
|
|
|
|
|
|
|
|
17175
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17176
|
|
|
|
|
|
|
// |
|
17177
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17178
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17179
|
|
|
|
|
|
|
// |
|
17180
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17181
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17182
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17183
|
|
|
|
|
|
|
|
|
17184
|
|
|
|
|
|
|
namespace parsito { |
|
17185
|
|
|
|
|
|
|
|
|
17186
|
|
|
|
|
|
|
// Left arc |
|
17187
|
387
|
|
|
|
|
|
bool transition_left_arc::applicable(const configuration& conf) const { |
|
17188
|
387
|
50
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
|
0
|
|
|
|
|
|
|
17189
|
|
|
|
|
|
|
return false; |
|
17190
|
|
|
|
|
|
|
else |
|
17191
|
387
|
100
|
|
|
|
|
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2]; |
|
|
|
100
|
|
|
|
|
|
|
17192
|
|
|
|
|
|
|
} |
|
17193
|
|
|
|
|
|
|
|
|
17194
|
15
|
|
|
|
|
|
int transition_left_arc::perform(configuration& conf) const { |
|
17195
|
15
|
50
|
|
|
|
|
assert(applicable(conf)); |
|
17196
|
|
|
|
|
|
|
|
|
17197
|
15
|
|
|
|
|
|
int parent = conf.stack.back(); conf.stack.pop_back(); |
|
17198
|
15
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
|
17199
|
15
|
|
|
|
|
|
conf.stack.push_back(parent); |
|
17200
|
15
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
|
17201
|
15
|
|
|
|
|
|
return child; |
|
17202
|
|
|
|
|
|
|
} |
|
17203
|
|
|
|
|
|
|
|
|
17204
|
|
|
|
|
|
|
// Right arc |
|
17205
|
395
|
|
|
|
|
|
bool transition_right_arc::applicable(const configuration& conf) const { |
|
17206
|
395
|
50
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
|
0
|
|
|
|
|
|
|
17207
|
0
|
0
|
|
|
|
|
return conf.stack.size() == 2 && conf.buffer.empty(); |
|
|
|
0
|
|
|
|
|
|
|
17208
|
395
|
50
|
|
|
|
|
else if (conf.single_root) // && !label_is_root |
|
17209
|
0
|
|
|
|
|
|
return conf.stack.size() > 2; |
|
17210
|
|
|
|
|
|
|
else |
|
17211
|
395
|
|
|
|
|
|
return conf.stack.size() >= 2; |
|
17212
|
|
|
|
|
|
|
} |
|
17213
|
|
|
|
|
|
|
|
|
17214
|
23
|
|
|
|
|
|
int transition_right_arc::perform(configuration& conf) const { |
|
17215
|
23
|
50
|
|
|
|
|
assert(applicable(conf)); |
|
17216
|
|
|
|
|
|
|
|
|
17217
|
23
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
|
17218
|
23
|
|
|
|
|
|
int parent = conf.stack.back(); |
|
17219
|
23
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
|
17220
|
23
|
|
|
|
|
|
return child; |
|
17221
|
|
|
|
|
|
|
} |
|
17222
|
|
|
|
|
|
|
|
|
17223
|
|
|
|
|
|
|
// Shift |
|
17224
|
90
|
|
|
|
|
|
bool transition_shift::applicable(const configuration& conf) const { |
|
17225
|
90
|
|
|
|
|
|
return !conf.buffer.empty(); |
|
17226
|
|
|
|
|
|
|
} |
|
17227
|
|
|
|
|
|
|
|
|
17228
|
28
|
|
|
|
|
|
int transition_shift::perform(configuration& conf) const { |
|
17229
|
28
|
50
|
|
|
|
|
assert(applicable(conf)); |
|
17230
|
|
|
|
|
|
|
|
|
17231
|
28
|
|
|
|
|
|
conf.stack.push_back(conf.buffer.back()); |
|
17232
|
|
|
|
|
|
|
conf.buffer.pop_back(); |
|
17233
|
28
|
|
|
|
|
|
return -1; |
|
17234
|
|
|
|
|
|
|
} |
|
17235
|
|
|
|
|
|
|
|
|
17236
|
|
|
|
|
|
|
// Swap |
|
17237
|
0
|
|
|
|
|
|
bool transition_swap::applicable(const configuration& conf) const { |
|
17238
|
0
|
0
|
|
|
|
|
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17239
|
|
|
|
|
|
|
} |
|
17240
|
|
|
|
|
|
|
|
|
17241
|
0
|
|
|
|
|
|
int transition_swap::perform(configuration& conf) const { |
|
17242
|
0
|
0
|
|
|
|
|
assert(applicable(conf)); |
|
17243
|
|
|
|
|
|
|
|
|
17244
|
0
|
|
|
|
|
|
int top = conf.stack.back(); conf.stack.pop_back(); |
|
17245
|
0
|
|
|
|
|
|
int to_buffer = conf.stack.back(); conf.stack.pop_back(); |
|
17246
|
0
|
|
|
|
|
|
conf.stack.push_back(top); |
|
17247
|
0
|
|
|
|
|
|
conf.buffer.push_back(to_buffer); |
|
17248
|
0
|
|
|
|
|
|
return -1; |
|
17249
|
|
|
|
|
|
|
} |
|
17250
|
|
|
|
|
|
|
|
|
17251
|
|
|
|
|
|
|
// Left arc 2 |
|
17252
|
0
|
|
|
|
|
|
bool transition_left_arc_2::applicable(const configuration& conf) const { |
|
17253
|
0
|
0
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
|
0
|
|
|
|
|
|
|
17254
|
|
|
|
|
|
|
return false; |
|
17255
|
|
|
|
|
|
|
else |
|
17256
|
0
|
0
|
|
|
|
|
return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3]; |
|
|
|
0
|
|
|
|
|
|
|
17257
|
|
|
|
|
|
|
} |
|
17258
|
|
|
|
|
|
|
|
|
17259
|
0
|
|
|
|
|
|
int transition_left_arc_2::perform(configuration& conf) const { |
|
17260
|
0
|
0
|
|
|
|
|
assert(applicable(conf)); |
|
17261
|
|
|
|
|
|
|
|
|
17262
|
0
|
|
|
|
|
|
int parent = conf.stack.back(); conf.stack.pop_back(); |
|
17263
|
0
|
|
|
|
|
|
int ignore = conf.stack.back(); conf.stack.pop_back(); |
|
17264
|
0
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
|
17265
|
0
|
|
|
|
|
|
conf.stack.push_back(ignore); |
|
17266
|
0
|
|
|
|
|
|
conf.stack.push_back(parent); |
|
17267
|
0
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
|
17268
|
0
|
|
|
|
|
|
return child; |
|
17269
|
|
|
|
|
|
|
} |
|
17270
|
|
|
|
|
|
|
|
|
17271
|
|
|
|
|
|
|
// Right arc 2 |
|
17272
|
0
|
|
|
|
|
|
bool transition_right_arc_2::applicable(const configuration& conf) const { |
|
17273
|
0
|
0
|
|
|
|
|
if (conf.single_root && label_is_root) |
|
|
|
0
|
|
|
|
|
|
|
17274
|
|
|
|
|
|
|
return false; |
|
17275
|
0
|
0
|
|
|
|
|
else if (conf.single_root) // && !label_is_root |
|
17276
|
0
|
|
|
|
|
|
return conf.stack.size() >= 4; |
|
17277
|
|
|
|
|
|
|
else |
|
17278
|
0
|
|
|
|
|
|
return conf.stack.size() >= 3; |
|
17279
|
|
|
|
|
|
|
} |
|
17280
|
|
|
|
|
|
|
|
|
17281
|
0
|
|
|
|
|
|
int transition_right_arc_2::perform(configuration& conf) const { |
|
17282
|
0
|
0
|
|
|
|
|
assert(applicable(conf)); |
|
17283
|
|
|
|
|
|
|
|
|
17284
|
0
|
|
|
|
|
|
int child = conf.stack.back(); conf.stack.pop_back(); |
|
17285
|
0
|
|
|
|
|
|
int to_buffer = conf.stack.back(); conf.stack.pop_back(); |
|
17286
|
0
|
|
|
|
|
|
int parent = conf.stack.back(); |
|
17287
|
0
|
|
|
|
|
|
conf.buffer.push_back(to_buffer); |
|
17288
|
0
|
|
|
|
|
|
conf.t->set_head(child, parent, label); |
|
17289
|
0
|
|
|
|
|
|
return child; |
|
17290
|
|
|
|
|
|
|
} |
|
17291
|
|
|
|
|
|
|
|
|
17292
|
|
|
|
|
|
|
} // namespace parsito |
|
17293
|
|
|
|
|
|
|
|
|
17294
|
|
|
|
|
|
|
///////// |
|
17295
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_link2.h |
|
17296
|
|
|
|
|
|
|
///////// |
|
17297
|
|
|
|
|
|
|
|
|
17298
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17299
|
|
|
|
|
|
|
// |
|
17300
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17301
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17302
|
|
|
|
|
|
|
// |
|
17303
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17304
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17305
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17306
|
|
|
|
|
|
|
|
|
17307
|
|
|
|
|
|
|
namespace parsito { |
|
17308
|
|
|
|
|
|
|
|
|
17309
|
0
|
|
|
|
|
|
class transition_system_link2 : public transition_system { |
|
17310
|
|
|
|
|
|
|
public: |
|
17311
|
|
|
|
|
|
|
transition_system_link2(const vector& labels); |
|
17312
|
|
|
|
|
|
|
|
|
17313
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const override; |
|
17314
|
|
|
|
|
|
|
}; |
|
17315
|
|
|
|
|
|
|
|
|
17316
|
|
|
|
|
|
|
} // namespace parsito |
|
17317
|
|
|
|
|
|
|
|
|
17318
|
|
|
|
|
|
|
///////// |
|
17319
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_projective.h |
|
17320
|
|
|
|
|
|
|
///////// |
|
17321
|
|
|
|
|
|
|
|
|
17322
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17323
|
|
|
|
|
|
|
// |
|
17324
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17325
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17326
|
|
|
|
|
|
|
// |
|
17327
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17328
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17329
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17330
|
|
|
|
|
|
|
|
|
17331
|
|
|
|
|
|
|
namespace parsito { |
|
17332
|
|
|
|
|
|
|
|
|
17333
|
2
|
|
|
|
|
|
class transition_system_projective : public transition_system { |
|
17334
|
|
|
|
|
|
|
public: |
|
17335
|
|
|
|
|
|
|
transition_system_projective(const vector& labels); |
|
17336
|
|
|
|
|
|
|
|
|
17337
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const override; |
|
17338
|
|
|
|
|
|
|
}; |
|
17339
|
|
|
|
|
|
|
|
|
17340
|
|
|
|
|
|
|
} // namespace parsito |
|
17341
|
|
|
|
|
|
|
|
|
17342
|
|
|
|
|
|
|
///////// |
|
17343
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_swap.h |
|
17344
|
|
|
|
|
|
|
///////// |
|
17345
|
|
|
|
|
|
|
|
|
17346
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17347
|
|
|
|
|
|
|
// |
|
17348
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17349
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17350
|
|
|
|
|
|
|
// |
|
17351
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17352
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17353
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17354
|
|
|
|
|
|
|
|
|
17355
|
|
|
|
|
|
|
namespace parsito { |
|
17356
|
|
|
|
|
|
|
|
|
17357
|
0
|
|
|
|
|
|
class transition_system_swap : public transition_system { |
|
17358
|
|
|
|
|
|
|
public: |
|
17359
|
|
|
|
|
|
|
transition_system_swap(const vector& labels); |
|
17360
|
|
|
|
|
|
|
|
|
17361
|
|
|
|
|
|
|
virtual transition_oracle* oracle(const string& name) const override; |
|
17362
|
|
|
|
|
|
|
}; |
|
17363
|
|
|
|
|
|
|
|
|
17364
|
|
|
|
|
|
|
} // namespace parsito |
|
17365
|
|
|
|
|
|
|
|
|
17366
|
|
|
|
|
|
|
///////// |
|
17367
|
|
|
|
|
|
|
// File: parsito/transition/transition_system.cpp |
|
17368
|
|
|
|
|
|
|
///////// |
|
17369
|
|
|
|
|
|
|
|
|
17370
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17371
|
|
|
|
|
|
|
// |
|
17372
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17373
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17374
|
|
|
|
|
|
|
// |
|
17375
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17376
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17377
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17378
|
|
|
|
|
|
|
|
|
17379
|
|
|
|
|
|
|
namespace parsito { |
|
17380
|
|
|
|
|
|
|
|
|
17381
|
0
|
|
|
|
|
|
unsigned transition_system::transition_count() const { |
|
17382
|
0
|
|
|
|
|
|
return transitions.size(); |
|
17383
|
|
|
|
|
|
|
} |
|
17384
|
|
|
|
|
|
|
|
|
17385
|
806
|
|
|
|
|
|
bool transition_system::applicable(const configuration& conf, unsigned transition) const { |
|
17386
|
806
|
50
|
|
|
|
|
assert(transition < transitions.size()); |
|
17387
|
|
|
|
|
|
|
|
|
17388
|
806
|
|
|
|
|
|
return transitions[transition]->applicable(conf); |
|
17389
|
|
|
|
|
|
|
} |
|
17390
|
|
|
|
|
|
|
|
|
17391
|
66
|
|
|
|
|
|
int transition_system::perform(configuration& conf, unsigned transition) const { |
|
17392
|
66
|
50
|
|
|
|
|
assert(transition < transitions.size()); |
|
17393
|
|
|
|
|
|
|
|
|
17394
|
66
|
|
|
|
|
|
return transitions[transition]->perform(conf); |
|
17395
|
|
|
|
|
|
|
} |
|
17396
|
|
|
|
|
|
|
|
|
17397
|
1
|
|
|
|
|
|
transition_system* transition_system::create(const string& name, const vector& labels) { |
|
17398
|
1
|
50
|
|
|
|
|
if (name == "projective") return new transition_system_projective(labels); |
|
|
|
50
|
|
|
|
|
|
|
17399
|
0
|
0
|
|
|
|
|
if (name == "swap") return new transition_system_swap(labels); |
|
|
|
0
|
|
|
|
|
|
|
17400
|
1
|
0
|
|
|
|
|
if (name == "link2") return new transition_system_link2(labels); |
|
|
|
0
|
|
|
|
|
|
|
17401
|
|
|
|
|
|
|
return nullptr; |
|
17402
|
|
|
|
|
|
|
} |
|
17403
|
|
|
|
|
|
|
|
|
17404
|
|
|
|
|
|
|
} // namespace parsito |
|
17405
|
|
|
|
|
|
|
|
|
17406
|
|
|
|
|
|
|
///////// |
|
17407
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_link2.cpp |
|
17408
|
|
|
|
|
|
|
///////// |
|
17409
|
|
|
|
|
|
|
|
|
17410
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17411
|
|
|
|
|
|
|
// |
|
17412
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17413
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17414
|
|
|
|
|
|
|
// |
|
17415
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17416
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17417
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17418
|
|
|
|
|
|
|
|
|
17419
|
|
|
|
|
|
|
namespace parsito { |
|
17420
|
|
|
|
|
|
|
|
|
17421
|
0
|
|
|
|
|
|
transition_system_link2::transition_system_link2(const vector& labels) : transition_system(labels) { |
|
17422
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_shift()); |
|
|
|
0
|
|
|
|
|
|
|
17423
|
0
|
0
|
|
|
|
|
for (auto&& label : labels) { |
|
17424
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_left_arc(label)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17425
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_right_arc(label)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17426
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_left_arc_2(label)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17427
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_right_arc_2(label)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17428
|
|
|
|
|
|
|
} |
|
17429
|
0
|
|
|
|
|
|
} |
|
17430
|
|
|
|
|
|
|
|
|
17431
|
|
|
|
|
|
|
// Static oracle |
|
17432
|
0
|
|
|
|
|
|
class transition_system_link2_oracle_static : public transition_oracle { |
|
17433
|
|
|
|
|
|
|
public: |
|
17434
|
0
|
|
|
|
|
|
transition_system_link2_oracle_static(const vector& labels) : labels(labels) { |
|
17435
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
|
0
|
|
|
|
|
|
|
17436
|
0
|
|
|
|
|
|
} |
|
17437
|
|
|
|
|
|
|
|
|
17438
|
0
|
|
|
|
|
|
class tree_oracle_static : public transition_oracle::tree_oracle { |
|
17439
|
|
|
|
|
|
|
public: |
|
17440
|
0
|
|
|
|
|
|
tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {} |
|
17441
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
|
17442
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
|
17443
|
|
|
|
|
|
|
private: |
|
17444
|
|
|
|
|
|
|
const vector& labels; |
|
17445
|
|
|
|
|
|
|
unsigned root_label; |
|
17446
|
|
|
|
|
|
|
const tree& gold; |
|
17447
|
|
|
|
|
|
|
}; |
|
17448
|
|
|
|
|
|
|
|
|
17449
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
|
17450
|
|
|
|
|
|
|
private: |
|
17451
|
|
|
|
|
|
|
const vector& labels; |
|
17452
|
|
|
|
|
|
|
unsigned root_label; |
|
17453
|
|
|
|
|
|
|
}; |
|
17454
|
|
|
|
|
|
|
|
|
17455
|
0
|
|
|
|
|
|
unique_ptr transition_system_link2_oracle_static::create_tree_oracle(const tree& gold) const { |
|
17456
|
0
|
|
|
|
|
|
return unique_ptr(new tree_oracle_static(labels, root_label, gold)); |
|
17457
|
|
|
|
|
|
|
} |
|
17458
|
|
|
|
|
|
|
|
|
17459
|
0
|
|
|
|
|
|
void transition_system_link2_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const { |
|
17460
|
|
|
|
|
|
|
transitions.clear(); |
|
17461
|
|
|
|
|
|
|
|
|
17462
|
|
|
|
|
|
|
// Shift |
|
17463
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) transitions.push_back(0); |
|
17464
|
|
|
|
|
|
|
|
|
17465
|
|
|
|
|
|
|
// Arcs |
|
17466
|
0
|
|
|
|
|
|
unsigned parents[4] = {1, 2, 1, 3}; |
|
17467
|
0
|
|
|
|
|
|
unsigned children[4] = {2, 1, 3, 1}; |
|
17468
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 4; direction++) |
|
17469
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17470
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - parents[direction]]; |
|
17471
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - children[direction]]; |
|
17472
|
|
|
|
|
|
|
|
|
17473
|
|
|
|
|
|
|
// Allow arc_2 only when seeing golden edge. |
|
17474
|
0
|
0
|
|
|
|
|
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17475
|
|
|
|
|
|
|
|
|
17476
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17477
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17478
|
0
|
0
|
|
|
|
|
if (!conf.single_root || |
|
|
|
0
|
|
|
|
|
|
|
17479
|
0
|
0
|
|
|
|
|
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17480
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17481
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 3 && direction >= 2)) |
|
|
|
0
|
|
|
|
|
|
|
17482
|
0
|
|
|
|
|
|
transitions.push_back(1 + 4*i + direction); |
|
17483
|
|
|
|
|
|
|
} |
|
17484
|
0
|
|
|
|
|
|
} |
|
17485
|
|
|
|
|
|
|
|
|
17486
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_link2_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const { |
|
17487
|
|
|
|
|
|
|
// Arcs |
|
17488
|
0
|
|
|
|
|
|
unsigned parents[4] = {1, 2, 1, 3}; |
|
17489
|
0
|
|
|
|
|
|
unsigned children[4] = {2, 1, 3, 1}; |
|
17490
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 4; direction++) |
|
17491
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17492
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - parents[direction]]; |
|
17493
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - children[direction]]; |
|
17494
|
|
|
|
|
|
|
|
|
17495
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17496
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17497
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17498
|
0
|
|
|
|
|
|
return predicted_transition(1 + 4*i + direction, 1 + 4*i + direction); |
|
17499
|
|
|
|
|
|
|
|
|
17500
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
|
17501
|
|
|
|
|
|
|
} |
|
17502
|
|
|
|
|
|
|
} |
|
17503
|
|
|
|
|
|
|
|
|
17504
|
|
|
|
|
|
|
// Otherwise, just shift |
|
17505
|
0
|
|
|
|
|
|
return predicted_transition(0, 0); |
|
17506
|
|
|
|
|
|
|
} |
|
17507
|
|
|
|
|
|
|
|
|
17508
|
|
|
|
|
|
|
// Oracle factory method |
|
17509
|
0
|
|
|
|
|
|
transition_oracle* transition_system_link2::oracle(const string& name) const { |
|
17510
|
0
|
0
|
|
|
|
|
if (name == "static") return new transition_system_link2_oracle_static(labels); |
|
|
|
0
|
|
|
|
|
|
|
17511
|
|
|
|
|
|
|
return nullptr; |
|
17512
|
|
|
|
|
|
|
} |
|
17513
|
|
|
|
|
|
|
|
|
17514
|
|
|
|
|
|
|
} // namespace parsito |
|
17515
|
|
|
|
|
|
|
|
|
17516
|
|
|
|
|
|
|
///////// |
|
17517
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_projective.cpp |
|
17518
|
|
|
|
|
|
|
///////// |
|
17519
|
|
|
|
|
|
|
|
|
17520
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17521
|
|
|
|
|
|
|
// |
|
17522
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17523
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17524
|
|
|
|
|
|
|
// |
|
17525
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17526
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17527
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17528
|
|
|
|
|
|
|
|
|
17529
|
|
|
|
|
|
|
namespace parsito { |
|
17530
|
|
|
|
|
|
|
|
|
17531
|
1
|
|
|
|
|
|
transition_system_projective::transition_system_projective(const vector& labels) : transition_system(labels) { |
|
17532
|
1
|
50
|
|
|
|
|
transitions.emplace_back(new transition_shift()); |
|
|
|
50
|
|
|
|
|
|
|
17533
|
7
|
100
|
|
|
|
|
for (auto&& label : labels) { |
|
17534
|
6
|
50
|
|
|
|
|
transitions.emplace_back(new transition_left_arc(label)); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
17535
|
6
|
50
|
|
|
|
|
transitions.emplace_back(new transition_right_arc(label)); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
17536
|
|
|
|
|
|
|
} |
|
17537
|
1
|
|
|
|
|
|
} |
|
17538
|
|
|
|
|
|
|
|
|
17539
|
|
|
|
|
|
|
// Static oracle |
|
17540
|
0
|
|
|
|
|
|
class transition_system_projective_oracle_static : public transition_oracle { |
|
17541
|
|
|
|
|
|
|
public: |
|
17542
|
0
|
|
|
|
|
|
transition_system_projective_oracle_static(const vector& labels) : labels(labels) { |
|
17543
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
|
0
|
|
|
|
|
|
|
17544
|
0
|
|
|
|
|
|
} |
|
17545
|
|
|
|
|
|
|
|
|
17546
|
0
|
|
|
|
|
|
class tree_oracle_static : public transition_oracle::tree_oracle { |
|
17547
|
|
|
|
|
|
|
public: |
|
17548
|
0
|
|
|
|
|
|
tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), root_label(root_label), gold(gold) {} |
|
17549
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
|
17550
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
|
17551
|
|
|
|
|
|
|
private: |
|
17552
|
|
|
|
|
|
|
const vector& labels; |
|
17553
|
|
|
|
|
|
|
unsigned root_label; |
|
17554
|
|
|
|
|
|
|
const tree& gold; |
|
17555
|
|
|
|
|
|
|
}; |
|
17556
|
|
|
|
|
|
|
|
|
17557
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
|
17558
|
|
|
|
|
|
|
private: |
|
17559
|
|
|
|
|
|
|
const vector& labels; |
|
17560
|
|
|
|
|
|
|
unsigned root_label; |
|
17561
|
|
|
|
|
|
|
}; |
|
17562
|
|
|
|
|
|
|
|
|
17563
|
0
|
|
|
|
|
|
unique_ptr transition_system_projective_oracle_static::create_tree_oracle(const tree& gold) const { |
|
17564
|
0
|
|
|
|
|
|
return unique_ptr(new tree_oracle_static(labels, root_label, gold)); |
|
17565
|
|
|
|
|
|
|
} |
|
17566
|
|
|
|
|
|
|
|
|
17567
|
0
|
|
|
|
|
|
void transition_system_projective_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const { |
|
17568
|
|
|
|
|
|
|
transitions.clear(); |
|
17569
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) transitions.push_back(0); |
|
17570
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) |
|
17571
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 2; direction++) { |
|
17572
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2 + direction]; |
|
17573
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17574
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17575
|
0
|
0
|
|
|
|
|
if (!conf.single_root || |
|
|
|
0
|
|
|
|
|
|
|
17576
|
0
|
0
|
|
|
|
|
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17577
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 2)) |
|
17578
|
0
|
|
|
|
|
|
transitions.push_back(1 + 2*i + direction); |
|
17579
|
|
|
|
|
|
|
} |
|
17580
|
0
|
|
|
|
|
|
} |
|
17581
|
|
|
|
|
|
|
|
|
17582
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_projective_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const { |
|
17583
|
|
|
|
|
|
|
// Use left if appropriate |
|
17584
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
|
17585
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 1]; |
|
17586
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2]; |
|
17587
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent) { |
|
17588
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17589
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17590
|
0
|
|
|
|
|
|
return predicted_transition(1 + 2*i, 1 + 2*i); |
|
17591
|
|
|
|
|
|
|
|
|
17592
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
|
17593
|
|
|
|
|
|
|
} |
|
17594
|
|
|
|
|
|
|
} |
|
17595
|
|
|
|
|
|
|
|
|
17596
|
|
|
|
|
|
|
// Use right if appropriate |
|
17597
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
|
17598
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 1]; |
|
17599
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 2]; |
|
17600
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17601
|
0
|
0
|
|
|
|
|
(conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) { |
|
|
|
0
|
|
|
|
|
|
|
17602
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17603
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17604
|
0
|
|
|
|
|
|
return predicted_transition(1 + 2*i + 1, 1 + 2*i + 1); |
|
17605
|
|
|
|
|
|
|
|
|
17606
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
|
17607
|
|
|
|
|
|
|
} |
|
17608
|
|
|
|
|
|
|
} |
|
17609
|
|
|
|
|
|
|
|
|
17610
|
|
|
|
|
|
|
// Otherwise, just shift |
|
17611
|
0
|
|
|
|
|
|
return predicted_transition(0, 0); |
|
17612
|
|
|
|
|
|
|
} |
|
17613
|
|
|
|
|
|
|
|
|
17614
|
|
|
|
|
|
|
// Dynamic oracle |
|
17615
|
0
|
|
|
|
|
|
class transition_system_projective_oracle_dynamic : public transition_oracle { |
|
17616
|
|
|
|
|
|
|
public: |
|
17617
|
0
|
|
|
|
|
|
transition_system_projective_oracle_dynamic(const vector& labels) : labels(labels) { |
|
17618
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
|
0
|
|
|
|
|
|
|
17619
|
0
|
|
|
|
|
|
} |
|
17620
|
|
|
|
|
|
|
|
|
17621
|
0
|
|
|
|
|
|
class tree_oracle_dynamic : public transition_oracle::tree_oracle { |
|
17622
|
|
|
|
|
|
|
public: |
|
17623
|
0
|
|
|
|
|
|
tree_oracle_dynamic(const vector& labels, unsigned root_label, const tree& gold) : labels(labels), gold(gold), oracle_static(labels, root_label, gold) {} |
|
17624
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
|
17625
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
|
17626
|
|
|
|
|
|
|
private: |
|
17627
|
|
|
|
|
|
|
const vector& labels; |
|
17628
|
|
|
|
|
|
|
const tree& gold; |
|
17629
|
|
|
|
|
|
|
transition_system_projective_oracle_static::tree_oracle_static oracle_static; |
|
17630
|
|
|
|
|
|
|
}; |
|
17631
|
|
|
|
|
|
|
|
|
17632
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
|
17633
|
|
|
|
|
|
|
private: |
|
17634
|
|
|
|
|
|
|
const vector& labels; |
|
17635
|
|
|
|
|
|
|
unsigned root_label; |
|
17636
|
|
|
|
|
|
|
}; |
|
17637
|
|
|
|
|
|
|
|
|
17638
|
0
|
|
|
|
|
|
unique_ptr transition_system_projective_oracle_dynamic::create_tree_oracle(const tree& gold) const { |
|
17639
|
0
|
|
|
|
|
|
return unique_ptr(new tree_oracle_dynamic(labels, root_label, gold)); |
|
17640
|
|
|
|
|
|
|
} |
|
17641
|
|
|
|
|
|
|
|
|
17642
|
0
|
|
|
|
|
|
void transition_system_projective_oracle_dynamic::tree_oracle_dynamic::interesting_transitions(const configuration& conf, vector& transitions) const { |
|
17643
|
0
|
|
|
|
|
|
oracle_static.interesting_transitions(conf, transitions); |
|
17644
|
0
|
|
|
|
|
|
} |
|
17645
|
|
|
|
|
|
|
|
|
17646
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_projective_oracle_dynamic::tree_oracle_dynamic::predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const { |
|
17647
|
|
|
|
|
|
|
// Use static oracle in the first iteration |
|
17648
|
0
|
0
|
|
|
|
|
if (iteration <= 1) |
|
17649
|
0
|
|
|
|
|
|
return oracle_static.predict(conf, network_outcome, iteration); |
|
17650
|
|
|
|
|
|
|
|
|
17651
|
|
|
|
|
|
|
// Use dynamic programming to compute transition leading to best parse tree |
|
17652
|
|
|
|
|
|
|
|
|
17653
|
|
|
|
|
|
|
// Start by computing the right stack |
|
17654
|
|
|
|
|
|
|
vector right_stack; |
|
17655
|
|
|
|
|
|
|
|
|
17656
|
|
|
|
|
|
|
unordered_set right_stack_inserted; |
|
17657
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) { |
|
17658
|
0
|
|
|
|
|
|
int buffer_start = conf.buffer.back(); |
|
17659
|
0
|
0
|
|
|
|
|
for (size_t i = conf.buffer.size(); i--; ) { |
|
17660
|
|
|
|
|
|
|
const auto& node = conf.buffer[i]; |
|
17661
|
0
|
|
|
|
|
|
bool to_right_stack = gold.nodes[node].head < buffer_start; |
|
17662
|
0
|
0
|
|
|
|
|
for (auto&& child : gold.nodes[node].children) |
|
17663
|
0
|
|
|
|
|
|
to_right_stack |= child < buffer_start || right_stack_inserted.count(child); |
|
17664
|
0
|
0
|
|
|
|
|
if (to_right_stack) { |
|
17665
|
0
|
0
|
|
|
|
|
right_stack.push_back(node); |
|
17666
|
|
|
|
|
|
|
right_stack_inserted.insert(node); |
|
17667
|
|
|
|
|
|
|
} |
|
17668
|
|
|
|
|
|
|
} |
|
17669
|
|
|
|
|
|
|
} |
|
17670
|
|
|
|
|
|
|
|
|
17671
|
|
|
|
|
|
|
// Fill the array T from the 2014 Goldberg paper |
|
17672
|
0
|
0
|
|
|
|
|
class t_representation { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17673
|
|
|
|
|
|
|
public: |
|
17674
|
0
|
|
|
|
|
|
t_representation(const vector& stack, const vector& right_stack, const tree& gold, const vector& labels) |
|
17675
|
0
|
0
|
|
|
|
|
: stack(stack), right_stack(right_stack), gold(gold), labels(labels) { |
|
|
|
0
|
|
|
|
|
|
|
17676
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 2; i++) { |
|
17677
|
0
|
0
|
|
|
|
|
costs[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
|
17678
|
0
|
0
|
|
|
|
|
transitions[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
|
17679
|
|
|
|
|
|
|
} |
|
17680
|
0
|
0
|
|
|
|
|
} |
|
|
|
0
|
|
|
|
|
|
|
17681
|
|
|
|
|
|
|
|
|
17682
|
0
|
|
|
|
|
|
void prepare(unsigned diagonal) { |
|
17683
|
0
|
|
|
|
|
|
costs[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), gold.nodes.size() + 1); |
|
17684
|
0
|
|
|
|
|
|
transitions[diagonal & 1].assign((diagonal + 1) * (diagonal + 1), -1); |
|
17685
|
0
|
|
|
|
|
|
} |
|
17686
|
|
|
|
|
|
|
|
|
17687
|
0
|
|
|
|
|
|
int& cost(unsigned i, unsigned j, unsigned h) { return costs[(i+j) & 1][i * (i+j+1) + h]; } |
|
17688
|
0
|
|
|
|
|
|
int& transition(unsigned i, unsigned j, unsigned h) { return transitions[(i+j) & 1][i * (i+j+1) + h]; } |
|
17689
|
|
|
|
|
|
|
|
|
17690
|
0
|
0
|
|
|
|
|
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17691
|
0
|
|
|
|
|
|
int edge_cost(int parent, int child) const { return gold.nodes[child].head != parent; } |
|
17692
|
0
|
|
|
|
|
|
int which_arc_transition(int parent, int child) const { |
|
17693
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17694
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17695
|
0
|
|
|
|
|
|
return 1 + 2*i + (child > parent); |
|
17696
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
|
17697
|
|
|
|
|
|
|
return 0; // To keep VS 2015 happy and warning-free |
|
17698
|
|
|
|
|
|
|
} |
|
17699
|
|
|
|
|
|
|
|
|
17700
|
|
|
|
|
|
|
private: |
|
17701
|
|
|
|
|
|
|
const vector& stack; |
|
17702
|
|
|
|
|
|
|
const vector& right_stack; |
|
17703
|
|
|
|
|
|
|
const tree& gold; |
|
17704
|
|
|
|
|
|
|
const vector& labels; |
|
17705
|
|
|
|
|
|
|
vector costs[2], transitions[2]; |
|
17706
|
0
|
0
|
|
|
|
|
} t(conf.stack, right_stack, gold, labels); |
|
17707
|
|
|
|
|
|
|
|
|
17708
|
0
|
0
|
|
|
|
|
t.prepare(0); |
|
17709
|
0
|
|
|
|
|
|
t.cost(0, 0, 0) = 0; |
|
17710
|
0
|
0
|
|
|
|
|
for (unsigned diagonal = 0; diagonal < conf.stack.size() + right_stack.size(); diagonal++) { |
|
17711
|
0
|
0
|
|
|
|
|
t.prepare(diagonal + 1); |
|
17712
|
0
|
0
|
|
|
|
|
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17713
|
0
|
|
|
|
|
|
unsigned j = diagonal - i; |
|
17714
|
|
|
|
|
|
|
|
|
17715
|
|
|
|
|
|
|
// Try extending stack |
|
17716
|
0
|
0
|
|
|
|
|
if (i+1 < conf.stack.size()) |
|
17717
|
0
|
0
|
|
|
|
|
for (unsigned h = 0; h <= diagonal; h++) { |
|
17718
|
|
|
|
|
|
|
int h_node = t.node(i, j, h), new_node = t.node(i+1, j, 0); |
|
17719
|
0
|
0
|
|
|
|
|
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17720
|
0
|
|
|
|
|
|
t.cost(i+1, j, h+1) = t.cost(i, j, h) + t.edge_cost(h_node, new_node); |
|
17721
|
0
|
0
|
|
|
|
|
t.transition(i+1, j, h+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(h_node, new_node); |
|
17722
|
|
|
|
|
|
|
} |
|
17723
|
0
|
0
|
|
|
|
|
if (t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i+1, j, 0) + (t.transition(i, j, h) != 0)) { |
|
17724
|
0
|
|
|
|
|
|
t.cost(i+1, j, 0) = t.cost(i, j, h) + t.edge_cost(new_node, h_node); |
|
17725
|
0
|
0
|
|
|
|
|
t.transition(i+1, j, 0) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(new_node, h_node); |
|
17726
|
|
|
|
|
|
|
} |
|
17727
|
|
|
|
|
|
|
} |
|
17728
|
|
|
|
|
|
|
|
|
17729
|
|
|
|
|
|
|
// Try extending right_stack |
|
17730
|
0
|
0
|
|
|
|
|
if (j+1 < right_stack.size() + 1) |
|
17731
|
0
|
0
|
|
|
|
|
for (unsigned h = 0; h <= diagonal; h++) { |
|
17732
|
|
|
|
|
|
|
int h_node = t.node(i, j, h), new_node = t.node(i, j+1, diagonal+1); |
|
17733
|
0
|
0
|
|
|
|
|
if (t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i, j+1, h) + (t.transition(i, j, h) > 0)) { |
|
17734
|
0
|
|
|
|
|
|
t.cost(i, j+1, h) = t.cost(i, j, h) + t.edge_cost(h_node, new_node); |
|
17735
|
0
|
0
|
|
|
|
|
t.transition(i, j+1, h) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
|
17736
|
|
|
|
|
|
|
} |
|
17737
|
0
|
0
|
|
|
|
|
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17738
|
0
|
|
|
|
|
|
t.cost(i, j+1, diagonal+1) = t.cost(i, j, h) + t.edge_cost(new_node, h_node); |
|
17739
|
0
|
0
|
|
|
|
|
t.transition(i, j+1, diagonal+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
|
17740
|
|
|
|
|
|
|
} |
|
17741
|
|
|
|
|
|
|
} |
|
17742
|
|
|
|
|
|
|
} |
|
17743
|
|
|
|
|
|
|
} |
|
17744
|
|
|
|
|
|
|
|
|
17745
|
0
|
|
|
|
|
|
return predicted_transition(t.transition(conf.stack.size() - 1, right_stack.size(), 0), network_outcome); |
|
17746
|
|
|
|
|
|
|
} |
|
17747
|
|
|
|
|
|
|
|
|
17748
|
|
|
|
|
|
|
// Oracle factory method |
|
17749
|
0
|
|
|
|
|
|
transition_oracle* transition_system_projective::oracle(const string& name) const { |
|
17750
|
0
|
0
|
|
|
|
|
if (name == "static") return new transition_system_projective_oracle_static(labels); |
|
|
|
0
|
|
|
|
|
|
|
17751
|
0
|
0
|
|
|
|
|
if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels); |
|
|
|
0
|
|
|
|
|
|
|
17752
|
|
|
|
|
|
|
return nullptr; |
|
17753
|
|
|
|
|
|
|
} |
|
17754
|
|
|
|
|
|
|
|
|
17755
|
|
|
|
|
|
|
} // namespace parsito |
|
17756
|
|
|
|
|
|
|
|
|
17757
|
|
|
|
|
|
|
///////// |
|
17758
|
|
|
|
|
|
|
// File: parsito/transition/transition_system_swap.cpp |
|
17759
|
|
|
|
|
|
|
///////// |
|
17760
|
|
|
|
|
|
|
|
|
17761
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17762
|
|
|
|
|
|
|
// |
|
17763
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17764
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17765
|
|
|
|
|
|
|
// |
|
17766
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17767
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17768
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17769
|
|
|
|
|
|
|
|
|
17770
|
|
|
|
|
|
|
namespace parsito { |
|
17771
|
|
|
|
|
|
|
|
|
17772
|
0
|
|
|
|
|
|
transition_system_swap::transition_system_swap(const vector& labels) : transition_system(labels) { |
|
17773
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_shift()); |
|
|
|
0
|
|
|
|
|
|
|
17774
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_swap()); |
|
|
|
0
|
|
|
|
|
|
|
17775
|
0
|
0
|
|
|
|
|
for (auto&& label : labels) { |
|
17776
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_left_arc(label)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17777
|
0
|
0
|
|
|
|
|
transitions.emplace_back(new transition_right_arc(label)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17778
|
|
|
|
|
|
|
} |
|
17779
|
0
|
|
|
|
|
|
} |
|
17780
|
|
|
|
|
|
|
|
|
17781
|
|
|
|
|
|
|
// Static oracle |
|
17782
|
0
|
|
|
|
|
|
class transition_system_swap_oracle_static : public transition_oracle { |
|
17783
|
|
|
|
|
|
|
public: |
|
17784
|
0
|
|
|
|
|
|
transition_system_swap_oracle_static(const vector& labels, bool lazy) : labels(labels), lazy(lazy) { |
|
17785
|
0
|
0
|
|
|
|
|
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
|
0
|
|
|
|
|
|
|
17786
|
0
|
|
|
|
|
|
} |
|
17787
|
|
|
|
|
|
|
|
|
17788
|
0
|
|
|
|
|
|
class tree_oracle_static : public transition_oracle::tree_oracle { |
|
17789
|
|
|
|
|
|
|
public: |
|
17790
|
0
|
|
|
|
|
|
tree_oracle_static(const vector& labels, unsigned root_label, const tree& gold, vector&& projective_order, vector&& projective_components) |
|
17791
|
0
|
0
|
|
|
|
|
: labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {} |
|
|
|
0
|
|
|
|
|
|
|
17792
|
|
|
|
|
|
|
virtual predicted_transition predict(const configuration& conf, unsigned network_outcome, unsigned iteration) const override; |
|
17793
|
|
|
|
|
|
|
virtual void interesting_transitions(const configuration& conf, vector& transitions) const override; |
|
17794
|
|
|
|
|
|
|
private: |
|
17795
|
|
|
|
|
|
|
const vector& labels; |
|
17796
|
|
|
|
|
|
|
unsigned root_label; |
|
17797
|
|
|
|
|
|
|
const tree& gold; |
|
17798
|
|
|
|
|
|
|
const vector projective_order; |
|
17799
|
|
|
|
|
|
|
const vector projective_components; |
|
17800
|
|
|
|
|
|
|
}; |
|
17801
|
|
|
|
|
|
|
|
|
17802
|
|
|
|
|
|
|
virtual unique_ptr create_tree_oracle(const tree& gold) const override; |
|
17803
|
|
|
|
|
|
|
private: |
|
17804
|
|
|
|
|
|
|
void create_projective_order(const tree& gold, int node, vector& projective_order, int& projective_index) const; |
|
17805
|
|
|
|
|
|
|
void create_projective_component(const tree& gold, int node, vector& projective_components, int component_index) const; |
|
17806
|
|
|
|
|
|
|
|
|
17807
|
|
|
|
|
|
|
const vector& labels; |
|
17808
|
|
|
|
|
|
|
bool lazy; |
|
17809
|
|
|
|
|
|
|
unsigned root_label; |
|
17810
|
|
|
|
|
|
|
}; |
|
17811
|
|
|
|
|
|
|
|
|
17812
|
0
|
|
|
|
|
|
unique_ptr transition_system_swap_oracle_static::create_tree_oracle(const tree& gold) const { |
|
17813
|
0
|
|
|
|
|
|
vector projective_order(gold.nodes.size()); |
|
17814
|
|
|
|
|
|
|
int projective_index; |
|
17815
|
0
|
|
|
|
|
|
create_projective_order(gold, 0, projective_order, projective_index); |
|
17816
|
|
|
|
|
|
|
|
|
17817
|
|
|
|
|
|
|
vector projective_components; |
|
17818
|
0
|
0
|
|
|
|
|
if (lazy) { |
|
17819
|
0
|
0
|
|
|
|
|
tree_oracle_static projective_oracle(labels, root_label, gold, vector(), vector()); |
|
17820
|
0
|
|
|
|
|
|
configuration conf(false); |
|
17821
|
|
|
|
|
|
|
tree t = gold; |
|
17822
|
0
|
0
|
|
|
|
|
transition_system_swap system(labels); |
|
17823
|
|
|
|
|
|
|
|
|
17824
|
0
|
0
|
|
|
|
|
conf.init(&t); |
|
17825
|
0
|
0
|
|
|
|
|
while (!conf.final()) { |
|
17826
|
0
|
|
|
|
|
|
auto prediction = projective_oracle.predict(conf, 0, 0); |
|
17827
|
0
|
0
|
|
|
|
|
if (!system.applicable(conf, prediction.to_follow)) break; |
|
|
|
0
|
|
|
|
|
|
|
17828
|
0
|
0
|
|
|
|
|
system.perform(conf, prediction.to_follow); |
|
17829
|
|
|
|
|
|
|
} |
|
17830
|
|
|
|
|
|
|
|
|
17831
|
0
|
|
|
|
|
|
projective_components.assign(gold.nodes.size(), 0); |
|
17832
|
0
|
0
|
|
|
|
|
for (auto&& node : conf.stack) |
|
17833
|
0
|
0
|
|
|
|
|
if (node) |
|
17834
|
0
|
|
|
|
|
|
create_projective_component(t, node, projective_components, node); |
|
17835
|
|
|
|
|
|
|
} |
|
17836
|
|
|
|
|
|
|
|
|
17837
|
0
|
0
|
|
|
|
|
return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components))); |
|
|
|
0
|
|
|
|
|
|
|
17838
|
|
|
|
|
|
|
} |
|
17839
|
|
|
|
|
|
|
|
|
17840
|
0
|
|
|
|
|
|
void transition_system_swap_oracle_static::create_projective_order(const tree& gold, int node, vector& projective_order, int& projective_index) const { |
|
17841
|
|
|
|
|
|
|
unsigned child_index = 0; |
|
17842
|
0
|
0
|
|
|
|
|
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17843
|
0
|
|
|
|
|
|
create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index); |
|
17844
|
0
|
|
|
|
|
|
projective_order[node] = projective_index++; |
|
17845
|
0
|
0
|
|
|
|
|
while (child_index < gold.nodes[node].children.size()) |
|
17846
|
0
|
|
|
|
|
|
create_projective_order(gold, gold.nodes[node].children[child_index++], projective_order, projective_index); |
|
17847
|
0
|
|
|
|
|
|
} |
|
17848
|
|
|
|
|
|
|
|
|
17849
|
0
|
|
|
|
|
|
void transition_system_swap_oracle_static::create_projective_component(const tree& gold, int node, vector& projective_components, int component_index) const { |
|
17850
|
0
|
|
|
|
|
|
projective_components[node] = component_index; |
|
17851
|
0
|
0
|
|
|
|
|
for (auto&& child : gold.nodes[node].children) |
|
17852
|
0
|
|
|
|
|
|
create_projective_component(gold, child, projective_components, component_index); |
|
17853
|
0
|
|
|
|
|
|
} |
|
17854
|
|
|
|
|
|
|
|
|
17855
|
0
|
|
|
|
|
|
void transition_system_swap_oracle_static::tree_oracle_static::interesting_transitions(const configuration& conf, vector& transitions) const { |
|
17856
|
|
|
|
|
|
|
transitions.clear(); |
|
17857
|
0
|
0
|
|
|
|
|
if (!conf.buffer.empty()) transitions.push_back(0); |
|
17858
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
|
17859
|
|
|
|
|
|
|
// Swap |
|
17860
|
0
|
0
|
|
|
|
|
if (!projective_order.empty()) { |
|
17861
|
0
|
|
|
|
|
|
int last = conf.stack[conf.stack.size() - 1]; |
|
17862
|
0
|
|
|
|
|
|
int prev = conf.stack[conf.stack.size() - 2]; |
|
17863
|
0
|
0
|
|
|
|
|
if (projective_order[last] < projective_order[prev] && |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17864
|
0
|
0
|
|
|
|
|
(projective_components.empty() || |
|
17865
|
0
|
0
|
|
|
|
|
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
|
17866
|
0
|
|
|
|
|
|
transitions.push_back(1); |
|
17867
|
|
|
|
|
|
|
} |
|
17868
|
|
|
|
|
|
|
|
|
17869
|
|
|
|
|
|
|
// Arcs |
|
17870
|
0
|
0
|
|
|
|
|
for (int direction = 0; direction < 2; direction++) { |
|
17871
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2 + direction]; |
|
17872
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17873
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17874
|
0
|
0
|
|
|
|
|
if (!conf.single_root || |
|
|
|
0
|
|
|
|
|
|
|
17875
|
0
|
0
|
|
|
|
|
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17876
|
0
|
0
|
|
|
|
|
(i != root_label && conf.stack.size() > 2)) |
|
17877
|
0
|
|
|
|
|
|
transitions.push_back(2 + 2*i + direction); |
|
17878
|
|
|
|
|
|
|
} |
|
17879
|
|
|
|
|
|
|
} |
|
17880
|
0
|
|
|
|
|
|
} |
|
17881
|
|
|
|
|
|
|
|
|
17882
|
0
|
|
|
|
|
|
transition_oracle::predicted_transition transition_system_swap_oracle_static::tree_oracle_static::predict(const configuration& conf, unsigned /*network_outcome*/, unsigned /*iteration*/) const { |
|
17883
|
|
|
|
|
|
|
// Use left if appropriate |
|
17884
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
|
17885
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 1]; |
|
17886
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 2]; |
|
17887
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17888
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17889
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17890
|
0
|
|
|
|
|
|
return predicted_transition(2 + 2*i, 2 + 2*i); |
|
17891
|
|
|
|
|
|
|
|
|
17892
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
|
17893
|
|
|
|
|
|
|
} |
|
17894
|
|
|
|
|
|
|
} |
|
17895
|
|
|
|
|
|
|
|
|
17896
|
|
|
|
|
|
|
// Use right if appropriate |
|
17897
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2) { |
|
17898
|
0
|
|
|
|
|
|
int child = conf.stack[conf.stack.size() - 1]; |
|
17899
|
0
|
|
|
|
|
|
int parent = conf.stack[conf.stack.size() - 2]; |
|
17900
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17901
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < labels.size(); i++) |
|
17902
|
0
|
0
|
|
|
|
|
if (gold.nodes[child].deprel == labels[i]) |
|
17903
|
0
|
|
|
|
|
|
return predicted_transition(2 + 2*i + 1, 2 + 2*i + 1); |
|
17904
|
|
|
|
|
|
|
|
|
17905
|
0
|
|
|
|
|
|
assert(!"label was not found"); |
|
17906
|
|
|
|
|
|
|
} |
|
17907
|
|
|
|
|
|
|
} |
|
17908
|
|
|
|
|
|
|
|
|
17909
|
|
|
|
|
|
|
// Use swap if appropriate |
|
17910
|
0
|
0
|
|
|
|
|
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17911
|
0
|
|
|
|
|
|
int last = conf.stack[conf.stack.size() - 1]; |
|
17912
|
0
|
|
|
|
|
|
int prev = conf.stack[conf.stack.size() - 2]; |
|
17913
|
0
|
0
|
|
|
|
|
if (projective_order[last] < projective_order[prev] && |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17914
|
0
|
0
|
|
|
|
|
(projective_components.empty() || |
|
17915
|
0
|
0
|
|
|
|
|
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
|
17916
|
0
|
|
|
|
|
|
return predicted_transition(1, 1); |
|
17917
|
|
|
|
|
|
|
} |
|
17918
|
|
|
|
|
|
|
|
|
17919
|
|
|
|
|
|
|
// Otherwise, just shift |
|
17920
|
0
|
|
|
|
|
|
return predicted_transition(0, 0); |
|
17921
|
|
|
|
|
|
|
} |
|
17922
|
|
|
|
|
|
|
|
|
17923
|
|
|
|
|
|
|
// Oracle factory method |
|
17924
|
0
|
|
|
|
|
|
transition_oracle* transition_system_swap::oracle(const string& name) const { |
|
17925
|
0
|
0
|
|
|
|
|
if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false); |
|
|
|
0
|
|
|
|
|
|
|
17926
|
0
|
0
|
|
|
|
|
if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true); |
|
|
|
0
|
|
|
|
|
|
|
17927
|
|
|
|
|
|
|
return nullptr; |
|
17928
|
|
|
|
|
|
|
} |
|
17929
|
|
|
|
|
|
|
|
|
17930
|
|
|
|
|
|
|
} // namespace parsito |
|
17931
|
|
|
|
|
|
|
|
|
17932
|
|
|
|
|
|
|
///////// |
|
17933
|
|
|
|
|
|
|
// File: parsito/tree/tree.cpp |
|
17934
|
|
|
|
|
|
|
///////// |
|
17935
|
|
|
|
|
|
|
|
|
17936
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
17937
|
|
|
|
|
|
|
// |
|
17938
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
17939
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
17940
|
|
|
|
|
|
|
// |
|
17941
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
17942
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
17943
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
17944
|
|
|
|
|
|
|
|
|
17945
|
|
|
|
|
|
|
namespace parsito { |
|
17946
|
|
|
|
|
|
|
|
|
17947
|
2
|
|
|
|
|
|
const string tree::root_form = ""; |
|
17948
|
|
|
|
|
|
|
|
|
17949
|
1
|
|
|
|
|
|
tree::tree() { |
|
17950
|
1
|
50
|
|
|
|
|
clear(); |
|
17951
|
1
|
|
|
|
|
|
} |
|
17952
|
|
|
|
|
|
|
|
|
17953
|
0
|
|
|
|
|
|
bool tree::empty() { |
|
17954
|
0
|
|
|
|
|
|
return nodes.size() == 1; |
|
17955
|
|
|
|
|
|
|
} |
|
17956
|
|
|
|
|
|
|
|
|
17957
|
2
|
|
|
|
|
|
void tree::clear() { |
|
17958
|
|
|
|
|
|
|
nodes.clear(); |
|
17959
|
|
|
|
|
|
|
node& root = add_node(root_form); |
|
17960
|
8
|
|
|
|
|
|
root.lemma = root.upostag = root.xpostag = root.feats = root_form; |
|
17961
|
2
|
|
|
|
|
|
} |
|
17962
|
|
|
|
|
|
|
|
|
17963
|
0
|
|
|
|
|
|
node& tree::add_node(const string& form) { |
|
17964
|
9
|
0
|
|
|
|
|
nodes.emplace_back((int)nodes.size(), form); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
17965
|
0
|
|
|
|
|
|
return nodes.back(); |
|
17966
|
|
|
|
|
|
|
} |
|
17967
|
|
|
|
|
|
|
|
|
17968
|
38
|
|
|
|
|
|
void tree::set_head(int id, int head, const string& deprel) { |
|
17969
|
38
|
50
|
|
|
|
|
assert(id >= 0 && id < int(nodes.size())); |
|
|
|
50
|
|
|
|
|
|
|
17970
|
38
|
50
|
|
|
|
|
assert(head < int(nodes.size())); |
|
17971
|
|
|
|
|
|
|
|
|
17972
|
|
|
|
|
|
|
// Remove existing head |
|
17973
|
38
|
50
|
|
|
|
|
if (nodes[id].head >= 0) { |
|
17974
|
0
|
|
|
|
|
|
auto& children = nodes[nodes[id].head].children; |
|
17975
|
0
|
0
|
|
|
|
|
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
17976
|
0
|
0
|
|
|
|
|
if (children[i-1] == id) { |
|
17977
|
|
|
|
|
|
|
children.erase(children.begin() + i - 1); |
|
17978
|
0
|
|
|
|
|
|
break; |
|
17979
|
|
|
|
|
|
|
} |
|
17980
|
|
|
|
|
|
|
} |
|
17981
|
|
|
|
|
|
|
|
|
17982
|
|
|
|
|
|
|
// Set new head |
|
17983
|
76
|
|
|
|
|
|
nodes[id].head = head; |
|
17984
|
38
|
|
|
|
|
|
nodes[id].deprel = deprel; |
|
17985
|
38
|
50
|
|
|
|
|
if (head >= 0) { |
|
17986
|
76
|
|
|
|
|
|
auto& children = nodes[head].children; |
|
17987
|
|
|
|
|
|
|
size_t i = children.size(); |
|
17988
|
56
|
100
|
|
|
|
|
while (i && children[i-1] > id) i--; |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
17989
|
38
|
100
|
|
|
|
|
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
17990
|
|
|
|
|
|
|
} |
|
17991
|
38
|
|
|
|
|
|
} |
|
17992
|
|
|
|
|
|
|
|
|
17993
|
0
|
|
|
|
|
|
void tree::unlink_all_nodes() { |
|
17994
|
9
|
0
|
|
|
|
|
for (auto&& node : nodes) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
17995
|
8
|
|
|
|
|
|
node.head = -1; |
|
17996
|
|
|
|
|
|
|
node.deprel.clear(); |
|
17997
|
|
|
|
|
|
|
node.children.clear(); |
|
17998
|
|
|
|
|
|
|
} |
|
17999
|
0
|
|
|
|
|
|
} |
|
18000
|
|
|
|
|
|
|
|
|
18001
|
|
|
|
|
|
|
} // namespace parsito |
|
18002
|
|
|
|
|
|
|
|
|
18003
|
|
|
|
|
|
|
///////// |
|
18004
|
|
|
|
|
|
|
// File: parsito/tree/tree_format.h |
|
18005
|
|
|
|
|
|
|
///////// |
|
18006
|
|
|
|
|
|
|
|
|
18007
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
18008
|
|
|
|
|
|
|
// |
|
18009
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18010
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18011
|
|
|
|
|
|
|
// |
|
18012
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18013
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18014
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18015
|
|
|
|
|
|
|
|
|
18016
|
|
|
|
|
|
|
namespace parsito { |
|
18017
|
|
|
|
|
|
|
|
|
18018
|
|
|
|
|
|
|
// Input format |
|
18019
|
0
|
|
|
|
|
|
class tree_input_format { |
|
18020
|
|
|
|
|
|
|
public: |
|
18021
|
0
|
|
|
|
|
|
virtual ~tree_input_format() {} |
|
18022
|
|
|
|
|
|
|
|
|
18023
|
|
|
|
|
|
|
virtual bool read_block(istream& in, string& block) const = 0; |
|
18024
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) = 0; |
|
18025
|
|
|
|
|
|
|
virtual bool next_tree(tree& t) = 0; |
|
18026
|
|
|
|
|
|
|
const string& last_error() const; |
|
18027
|
|
|
|
|
|
|
|
|
18028
|
|
|
|
|
|
|
// Static factory methods |
|
18029
|
|
|
|
|
|
|
static tree_input_format* new_input_format(const string& name); |
|
18030
|
|
|
|
|
|
|
static tree_input_format* new_conllu_input_format(); |
|
18031
|
|
|
|
|
|
|
|
|
18032
|
|
|
|
|
|
|
protected: |
|
18033
|
|
|
|
|
|
|
string error; |
|
18034
|
|
|
|
|
|
|
}; |
|
18035
|
|
|
|
|
|
|
|
|
18036
|
|
|
|
|
|
|
// Output format |
|
18037
|
0
|
|
|
|
|
|
class tree_output_format { |
|
18038
|
|
|
|
|
|
|
public: |
|
18039
|
0
|
|
|
|
|
|
virtual ~tree_output_format() {} |
|
18040
|
|
|
|
|
|
|
|
|
18041
|
|
|
|
|
|
|
virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const = 0; |
|
18042
|
|
|
|
|
|
|
|
|
18043
|
|
|
|
|
|
|
// Static factory methods |
|
18044
|
|
|
|
|
|
|
static tree_output_format* new_output_format(const string& name); |
|
18045
|
|
|
|
|
|
|
static tree_output_format* new_conllu_output_format(); |
|
18046
|
|
|
|
|
|
|
}; |
|
18047
|
|
|
|
|
|
|
|
|
18048
|
|
|
|
|
|
|
} // namespace parsito |
|
18049
|
|
|
|
|
|
|
|
|
18050
|
|
|
|
|
|
|
///////// |
|
18051
|
|
|
|
|
|
|
// File: parsito/tree/tree_format_conllu.h |
|
18052
|
|
|
|
|
|
|
///////// |
|
18053
|
|
|
|
|
|
|
|
|
18054
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
18055
|
|
|
|
|
|
|
// |
|
18056
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18057
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18058
|
|
|
|
|
|
|
// |
|
18059
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18060
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18061
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18062
|
|
|
|
|
|
|
|
|
18063
|
|
|
|
|
|
|
namespace parsito { |
|
18064
|
|
|
|
|
|
|
|
|
18065
|
|
|
|
|
|
|
// Input CoNLL-U format |
|
18066
|
0
|
|
|
|
|
|
class tree_input_format_conllu : public tree_input_format { |
|
18067
|
|
|
|
|
|
|
public: |
|
18068
|
|
|
|
|
|
|
virtual bool read_block(istream& in, string& block) const override; |
|
18069
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
18070
|
|
|
|
|
|
|
virtual bool next_tree(tree& t) override; |
|
18071
|
|
|
|
|
|
|
|
|
18072
|
|
|
|
|
|
|
private: |
|
18073
|
|
|
|
|
|
|
friend class tree_output_format_conllu; |
|
18074
|
|
|
|
|
|
|
vector comments; |
|
18075
|
|
|
|
|
|
|
vector> multiword_tokens; |
|
18076
|
|
|
|
|
|
|
|
|
18077
|
|
|
|
|
|
|
string_piece text; |
|
18078
|
|
|
|
|
|
|
string text_copy; |
|
18079
|
|
|
|
|
|
|
}; |
|
18080
|
|
|
|
|
|
|
|
|
18081
|
|
|
|
|
|
|
// Output CoNLL-U format |
|
18082
|
0
|
|
|
|
|
|
class tree_output_format_conllu : public tree_output_format { |
|
18083
|
|
|
|
|
|
|
public: |
|
18084
|
|
|
|
|
|
|
virtual void write_tree(const tree& t, string& output, const tree_input_format* additional_info = nullptr) const override; |
|
18085
|
|
|
|
|
|
|
|
|
18086
|
|
|
|
|
|
|
private: |
|
18087
|
|
|
|
|
|
|
static const string underscore; |
|
18088
|
0
|
0
|
|
|
|
|
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18089
|
|
|
|
|
|
|
}; |
|
18090
|
|
|
|
|
|
|
|
|
18091
|
|
|
|
|
|
|
} // namespace parsito |
|
18092
|
|
|
|
|
|
|
|
|
18093
|
|
|
|
|
|
|
///////// |
|
18094
|
|
|
|
|
|
|
// File: parsito/tree/tree_format.cpp |
|
18095
|
|
|
|
|
|
|
///////// |
|
18096
|
|
|
|
|
|
|
|
|
18097
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
18098
|
|
|
|
|
|
|
// |
|
18099
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18100
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18101
|
|
|
|
|
|
|
// |
|
18102
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18103
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18104
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18105
|
|
|
|
|
|
|
|
|
18106
|
|
|
|
|
|
|
namespace parsito { |
|
18107
|
|
|
|
|
|
|
|
|
18108
|
0
|
|
|
|
|
|
const string& tree_input_format::last_error() const { |
|
18109
|
0
|
|
|
|
|
|
return error; |
|
18110
|
|
|
|
|
|
|
} |
|
18111
|
|
|
|
|
|
|
|
|
18112
|
|
|
|
|
|
|
// Input Static factory methods |
|
18113
|
0
|
|
|
|
|
|
tree_input_format* tree_input_format::new_conllu_input_format() { |
|
18114
|
0
|
|
|
|
|
|
return new tree_input_format_conllu(); |
|
18115
|
|
|
|
|
|
|
} |
|
18116
|
|
|
|
|
|
|
|
|
18117
|
0
|
|
|
|
|
|
tree_input_format* tree_input_format::new_input_format(const string& name) { |
|
18118
|
0
|
0
|
|
|
|
|
if (name == "conllu") return new_conllu_input_format(); |
|
18119
|
|
|
|
|
|
|
return nullptr; |
|
18120
|
|
|
|
|
|
|
} |
|
18121
|
|
|
|
|
|
|
|
|
18122
|
|
|
|
|
|
|
// Output static factory methods |
|
18123
|
0
|
|
|
|
|
|
tree_output_format* tree_output_format::new_conllu_output_format() { |
|
18124
|
0
|
|
|
|
|
|
return new tree_output_format_conllu(); |
|
18125
|
|
|
|
|
|
|
} |
|
18126
|
|
|
|
|
|
|
|
|
18127
|
0
|
|
|
|
|
|
tree_output_format* tree_output_format::new_output_format(const string& name) { |
|
18128
|
0
|
0
|
|
|
|
|
if (name == "conllu") return new_conllu_output_format(); |
|
18129
|
|
|
|
|
|
|
return nullptr; |
|
18130
|
|
|
|
|
|
|
} |
|
18131
|
|
|
|
|
|
|
|
|
18132
|
|
|
|
|
|
|
} // namespace parsito |
|
18133
|
|
|
|
|
|
|
|
|
18134
|
|
|
|
|
|
|
///////// |
|
18135
|
|
|
|
|
|
|
// File: parsito/tree/tree_format_conllu.cpp |
|
18136
|
|
|
|
|
|
|
///////// |
|
18137
|
|
|
|
|
|
|
|
|
18138
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
18139
|
|
|
|
|
|
|
// |
|
18140
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18141
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18142
|
|
|
|
|
|
|
// |
|
18143
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18144
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18145
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18146
|
|
|
|
|
|
|
|
|
18147
|
|
|
|
|
|
|
namespace parsito { |
|
18148
|
|
|
|
|
|
|
|
|
18149
|
|
|
|
|
|
|
// Input CoNLL-U format |
|
18150
|
|
|
|
|
|
|
|
|
18151
|
0
|
|
|
|
|
|
bool tree_input_format_conllu::read_block(istream& in, string& block) const { |
|
18152
|
0
|
|
|
|
|
|
return bool(getpara(in, block)); |
|
18153
|
|
|
|
|
|
|
} |
|
18154
|
|
|
|
|
|
|
|
|
18155
|
0
|
|
|
|
|
|
void tree_input_format_conllu::set_text(string_piece text, bool make_copy) { |
|
18156
|
0
|
0
|
|
|
|
|
if (make_copy) { |
|
18157
|
0
|
|
|
|
|
|
text_copy.assign(text.str, text.len); |
|
18158
|
|
|
|
|
|
|
text = string_piece(text_copy.c_str(), text_copy.size()); |
|
18159
|
|
|
|
|
|
|
} |
|
18160
|
0
|
|
|
|
|
|
this->text = text; |
|
18161
|
0
|
|
|
|
|
|
} |
|
18162
|
|
|
|
|
|
|
|
|
18163
|
0
|
|
|
|
|
|
bool tree_input_format_conllu::next_tree(tree& t) { |
|
18164
|
|
|
|
|
|
|
error.clear(); |
|
18165
|
0
|
|
|
|
|
|
t.clear(); |
|
18166
|
|
|
|
|
|
|
comments.clear(); |
|
18167
|
|
|
|
|
|
|
multiword_tokens.clear(); |
|
18168
|
|
|
|
|
|
|
int last_multiword_token = 0; |
|
18169
|
|
|
|
|
|
|
|
|
18170
|
|
|
|
|
|
|
vector tokens, parts; |
|
18171
|
0
|
0
|
|
|
|
|
while (text.len) { |
|
18172
|
|
|
|
|
|
|
// Read line |
|
18173
|
0
|
|
|
|
|
|
string_piece line(text.str, 0); |
|
18174
|
0
|
0
|
|
|
|
|
while (line.len < text.len && line.str[line.len] != '\n') line.len++; |
|
|
|
0
|
|
|
|
|
|
|
18175
|
0
|
|
|
|
|
|
text.str += line.len + (line.len < text.len); |
|
18176
|
0
|
|
|
|
|
|
text.len -= line.len + (line.len < text.len); |
|
18177
|
|
|
|
|
|
|
|
|
18178
|
|
|
|
|
|
|
// Empty lines denote end of tree, unless at the beginning |
|
18179
|
0
|
0
|
|
|
|
|
if (!line.len) { |
|
18180
|
0
|
0
|
|
|
|
|
if (t.empty()) continue; |
|
18181
|
0
|
|
|
|
|
|
break; |
|
18182
|
|
|
|
|
|
|
} |
|
18183
|
|
|
|
|
|
|
|
|
18184
|
0
|
0
|
|
|
|
|
if (*line.str == '#') { |
|
18185
|
|
|
|
|
|
|
// Store comments at the beginning and ignore the rest |
|
18186
|
0
|
0
|
|
|
|
|
if (t.empty()) comments.push_back(line); |
|
|
|
0
|
|
|
|
|
|
|
18187
|
|
|
|
|
|
|
continue; |
|
18188
|
|
|
|
|
|
|
} |
|
18189
|
|
|
|
|
|
|
|
|
18190
|
|
|
|
|
|
|
// Parse another tree node |
|
18191
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
|
18192
|
0
|
0
|
|
|
|
|
if (tokens.size() != 10) |
|
18193
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18194
|
|
|
|
|
|
|
|
|
18195
|
|
|
|
|
|
|
// Store and skip multiword tokens |
|
18196
|
0
|
0
|
|
|
|
|
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
|
18197
|
0
|
0
|
|
|
|
|
split(tokens[0], '-', parts); |
|
18198
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) |
|
18199
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18200
|
|
|
|
|
|
|
int from, to; |
|
18201
|
0
|
0
|
|
|
|
|
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18202
|
|
|
|
|
|
|
return false; |
|
18203
|
0
|
0
|
|
|
|
|
if (from != int(t.nodes.size())) |
|
18204
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18205
|
0
|
0
|
|
|
|
|
if (to < from) |
|
18206
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18207
|
0
|
0
|
|
|
|
|
if (from <= last_multiword_token) |
|
18208
|
0
|
0
|
|
|
|
|
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18209
|
|
|
|
|
|
|
last_multiword_token = to; |
|
18210
|
0
|
0
|
|
|
|
|
multiword_tokens.emplace_back(from, line); |
|
18211
|
0
|
|
|
|
|
|
continue; |
|
18212
|
|
|
|
|
|
|
} |
|
18213
|
|
|
|
|
|
|
|
|
18214
|
|
|
|
|
|
|
// Parse node ID and head |
|
18215
|
|
|
|
|
|
|
int id; |
|
18216
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
|
|
0
|
|
|
|
|
|
|
18217
|
|
|
|
|
|
|
return false; |
|
18218
|
0
|
0
|
|
|
|
|
if (id != int(t.nodes.size())) |
|
18219
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18220
|
|
|
|
|
|
|
|
|
18221
|
|
|
|
|
|
|
int head; |
|
18222
|
0
|
0
|
|
|
|
|
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18223
|
0
|
|
|
|
|
|
head = -1; |
|
18224
|
|
|
|
|
|
|
} else { |
|
18225
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
|
|
0
|
|
|
|
|
|
|
18226
|
|
|
|
|
|
|
return false; |
|
18227
|
0
|
0
|
|
|
|
|
if (head < 0) |
|
18228
|
0
|
0
|
|
|
|
|
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18229
|
|
|
|
|
|
|
} |
|
18230
|
|
|
|
|
|
|
|
|
18231
|
|
|
|
|
|
|
// Add new node |
|
18232
|
0
|
|
|
|
|
|
auto& node = t.add_node(string(tokens[1].str, tokens[1].len)); |
|
18233
|
0
|
0
|
|
|
|
|
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18234
|
0
|
0
|
|
|
|
|
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18235
|
0
|
0
|
|
|
|
|
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18236
|
0
|
0
|
|
|
|
|
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18237
|
0
|
|
|
|
|
|
node.head = head; |
|
18238
|
0
|
0
|
|
|
|
|
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18239
|
0
|
0
|
|
|
|
|
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18240
|
0
|
0
|
|
|
|
|
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18241
|
|
|
|
|
|
|
} |
|
18242
|
|
|
|
|
|
|
|
|
18243
|
|
|
|
|
|
|
// Check that we got word for the last multiword token |
|
18244
|
0
|
0
|
|
|
|
|
if (last_multiword_token >= int(t.nodes.size())) |
|
18245
|
0
|
0
|
|
|
|
|
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18246
|
|
|
|
|
|
|
|
|
18247
|
|
|
|
|
|
|
// Set heads correctly |
|
18248
|
0
|
0
|
|
|
|
|
for (auto&& node : t.nodes) |
|
18249
|
0
|
0
|
|
|
|
|
if (node.id && node.head >= 0) { |
|
|
|
0
|
|
|
|
|
|
|
18250
|
0
|
0
|
|
|
|
|
if (node.head >= int(t.nodes.size())) |
|
18251
|
0
|
0
|
|
|
|
|
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18252
|
0
|
0
|
|
|
|
|
t.set_head(node.id, node.head, node.deprel); |
|
18253
|
|
|
|
|
|
|
} |
|
18254
|
|
|
|
|
|
|
|
|
18255
|
0
|
|
|
|
|
|
return !t.empty(); |
|
18256
|
|
|
|
|
|
|
} |
|
18257
|
|
|
|
|
|
|
|
|
18258
|
|
|
|
|
|
|
// Output CoNLL-U format |
|
18259
|
|
|
|
|
|
|
|
|
18260
|
2
|
|
|
|
|
|
const string tree_output_format_conllu::underscore = "_"; |
|
18261
|
|
|
|
|
|
|
|
|
18262
|
0
|
|
|
|
|
|
void tree_output_format_conllu::write_tree(const tree& t, string& output, const tree_input_format* additional_info) const { |
|
18263
|
|
|
|
|
|
|
output.clear(); |
|
18264
|
|
|
|
|
|
|
|
|
18265
|
|
|
|
|
|
|
// Try casting input format to CoNLL-U |
|
18266
|
0
|
0
|
|
|
|
|
auto input_conllu = dynamic_cast(additional_info); |
|
18267
|
|
|
|
|
|
|
size_t input_conllu_multiword_tokens = 0; |
|
18268
|
|
|
|
|
|
|
|
|
18269
|
|
|
|
|
|
|
// Comments if present |
|
18270
|
0
|
0
|
|
|
|
|
if (input_conllu) |
|
18271
|
0
|
0
|
|
|
|
|
for (auto&& comment : input_conllu->comments) |
|
18272
|
0
|
|
|
|
|
|
output.append(comment.str, comment.len).push_back('\n'); |
|
18273
|
|
|
|
|
|
|
|
|
18274
|
|
|
|
|
|
|
// Print out the tokens |
|
18275
|
0
|
0
|
|
|
|
|
for (int i = 1 /*skip the root node*/; i < int(t.nodes.size()); i++) { |
|
18276
|
|
|
|
|
|
|
// Write multiword token if present |
|
18277
|
0
|
0
|
|
|
|
|
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18278
|
0
|
|
|
|
|
|
i == input_conllu->multiword_tokens[input_conllu_multiword_tokens].first) { |
|
18279
|
0
|
|
|
|
|
|
output.append(input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.str, |
|
18280
|
0
|
|
|
|
|
|
input_conllu->multiword_tokens[input_conllu_multiword_tokens].second.len).push_back('\n'); |
|
18281
|
0
|
|
|
|
|
|
input_conllu_multiword_tokens++; |
|
18282
|
|
|
|
|
|
|
} |
|
18283
|
|
|
|
|
|
|
|
|
18284
|
|
|
|
|
|
|
// Write the token |
|
18285
|
0
|
0
|
|
|
|
|
output.append(to_string(i)).push_back('\t'); |
|
18286
|
0
|
|
|
|
|
|
output.append(t.nodes[i].form).push_back('\t'); |
|
18287
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].lemma)).push_back('\t'); |
|
18288
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].upostag)).push_back('\t'); |
|
18289
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].xpostag)).push_back('\t'); |
|
18290
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].feats)).push_back('\t'); |
|
18291
|
0
|
0
|
|
|
|
|
output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t'); |
|
|
|
0
|
|
|
|
|
|
|
18292
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].deprel)).push_back('\t'); |
|
18293
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].deps)).push_back('\t'); |
|
18294
|
0
|
|
|
|
|
|
output.append(underscore_on_empty(t.nodes[i].misc)).push_back('\n'); |
|
18295
|
|
|
|
|
|
|
} |
|
18296
|
0
|
|
|
|
|
|
output.push_back('\n'); |
|
18297
|
0
|
|
|
|
|
|
} |
|
18298
|
|
|
|
|
|
|
|
|
18299
|
|
|
|
|
|
|
} // namespace parsito |
|
18300
|
|
|
|
|
|
|
|
|
18301
|
|
|
|
|
|
|
///////// |
|
18302
|
|
|
|
|
|
|
// File: parsito/version/version.h |
|
18303
|
|
|
|
|
|
|
///////// |
|
18304
|
|
|
|
|
|
|
|
|
18305
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
18306
|
|
|
|
|
|
|
// |
|
18307
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18308
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18309
|
|
|
|
|
|
|
// |
|
18310
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18311
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18312
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18313
|
|
|
|
|
|
|
|
|
18314
|
|
|
|
|
|
|
namespace parsito { |
|
18315
|
|
|
|
|
|
|
|
|
18316
|
0
|
|
|
|
|
|
struct version { |
|
18317
|
|
|
|
|
|
|
unsigned major; |
|
18318
|
|
|
|
|
|
|
unsigned minor; |
|
18319
|
|
|
|
|
|
|
unsigned patch; |
|
18320
|
|
|
|
|
|
|
std::string prerelease; |
|
18321
|
|
|
|
|
|
|
|
|
18322
|
|
|
|
|
|
|
// Returns current version. |
|
18323
|
|
|
|
|
|
|
static version current(); |
|
18324
|
|
|
|
|
|
|
|
|
18325
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
|
18326
|
|
|
|
|
|
|
static string version_and_copyright(const string& other_libraries = string()); |
|
18327
|
|
|
|
|
|
|
}; |
|
18328
|
|
|
|
|
|
|
|
|
18329
|
|
|
|
|
|
|
} // namespace parsito |
|
18330
|
|
|
|
|
|
|
|
|
18331
|
|
|
|
|
|
|
///////// |
|
18332
|
|
|
|
|
|
|
// File: parsito/version/version.cpp |
|
18333
|
|
|
|
|
|
|
///////// |
|
18334
|
|
|
|
|
|
|
|
|
18335
|
|
|
|
|
|
|
// This file is part of Parsito . |
|
18336
|
|
|
|
|
|
|
// |
|
18337
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18338
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18339
|
|
|
|
|
|
|
// |
|
18340
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18341
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18342
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18343
|
|
|
|
|
|
|
|
|
18344
|
|
|
|
|
|
|
namespace parsito { |
|
18345
|
|
|
|
|
|
|
|
|
18346
|
|
|
|
|
|
|
// Returns current version. |
|
18347
|
0
|
|
|
|
|
|
// Builds the descriptor of the current Parsito version (1.1.1-devel).
version version::current() {
  version current_version{1, 1, 1, "devel"};
  return current_version;
}
|
18350
|
|
|
|
|
|
|
|
|
18351
|
|
|
|
|
|
|
// Returns multi-line formatted version and copyright string. |
|
18352
|
0
|
|
|
|
|
|
// Formats a human readable banner:
//   line 1: "Parsito version X.Y.Z[-prerelease] (using UniLib A.B.C[ and <other_libraries>])"
//   lines 2-3: the copyright notice.
// `other_libraries` is inserted verbatim, preceded by " and " when non-empty.
string version::version_and_copyright(const string& other_libraries) {
  const auto parsito_version = version::current();
  const auto unilib_version = unilib::version::current();

  ostringstream banner;

  // Parsito version, with the prerelease label separated by a dash if present.
  banner << "Parsito version " << parsito_version.major
         << '.' << parsito_version.minor
         << '.' << parsito_version.patch;
  if (!parsito_version.prerelease.empty())
    banner << "-";
  banner << parsito_version.prerelease;

  // Libraries used.
  banner << " (using UniLib " << unilib_version.major
         << '.' << unilib_version.minor
         << '.' << unilib_version.patch;
  if (!other_libraries.empty())
    banner << " and ";
  banner << other_libraries << ")\n";

  // Copyright notice.
  banner << "Copyright 2015 by Institute of Formal and Applied Linguistics, Faculty of\n"
            "Mathematics and Physics, Charles University in Prague, Czech Republic.";

  return banner.str();
}
|
18367
|
|
|
|
|
|
|
|
|
18368
|
|
|
|
|
|
|
} // namespace parsito |
|
18369
|
|
|
|
|
|
|
|
|
18370
|
|
|
|
|
|
|
///////// |
|
18371
|
|
|
|
|
|
|
// File: sentence/input_format.cpp |
|
18372
|
|
|
|
|
|
|
///////// |
|
18373
|
|
|
|
|
|
|
|
|
18374
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
18375
|
|
|
|
|
|
|
// |
|
18376
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18377
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18378
|
|
|
|
|
|
|
// |
|
18379
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18380
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18381
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18382
|
|
|
|
|
|
|
|
|
18383
|
2
|
|
|
|
|
|
// Textual option names exposed as static members of input_format.

// CoNLL-U format version selectors.
const string input_format::CONLLU_V1("v1");
const string input_format::CONLLU_V2("v2");

// Option names of the generic tokenizer.
const string input_format::GENERIC_TOKENIZER_NORMALIZED_SPACES("normalized_spaces");
const string input_format::GENERIC_TOKENIZER_PRESEGMENTED("presegmented");
const string input_format::GENERIC_TOKENIZER_RANGES("ranges");
|
18388
|
|
|
|
|
|
|
|
|
18389
|
|
|
|
|
|
|
// CoNLL-U input format |
|
18390
|
0
|
|
|
|
|
|
class input_format_conllu : public input_format { |
|
18391
|
|
|
|
|
|
|
public: |
|
18392
|
0
|
|
|
|
|
|
input_format_conllu(unsigned version) : version(version) {} |
|
18393
|
|
|
|
|
|
|
|
|
18394
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
|
18395
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) override; |
|
18396
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
18397
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
|
18398
|
|
|
|
|
|
|
|
|
18399
|
|
|
|
|
|
|
private: |
|
18400
|
|
|
|
|
|
|
unsigned version; |
|
18401
|
|
|
|
|
|
|
string_piece text; |
|
18402
|
|
|
|
|
|
|
string text_copy; |
|
18403
|
|
|
|
|
|
|
|
|
18404
|
|
|
|
|
|
|
static const string columns[10]; |
|
18405
|
|
|
|
|
|
|
}; |
|
18406
|
|
|
|
|
|
|
|
|
18407
|
26
|
100
|
|
|
|
|
// Column names in CoNLL-U order; indexed by the zero-based column number.
const string input_format_conllu::columns[10] = {
  "ID", "FORM", "LEMMA", "UPOS", "XPOS",
  "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"
};
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18409
|
|
|
|
|
|
|
|
|
18410
|
0
|
|
|
|
|
|
// Reads one block of input from `is` into `block` using getpara and reports
// whether the read succeeded (the resulting stream state converted to bool).
bool input_format_conllu::read_block(istream& is, string& block) const {
  return static_cast<bool>(getpara(is, block));
}
|
18413
|
|
|
|
|
|
|
|
|
18414
|
0
|
|
|
|
|
|
// Starts a new document. The document id is ignored by this format; the only
// state to reset is the pending text, which is replaced by an empty one.
void input_format_conllu::reset_document(string_piece /*id*/) {
  this->set_text("");
}
|
18417
|
|
|
|
|
|
|
|
|
18418
|
0
|
|
|
|
|
|
// Sets the text to be parsed by subsequent next_sentence calls.
// With make_copy == false only the view is stored (the characters are not
// copied, so the caller presumably has to keep them alive); with
// make_copy == true the characters are copied into text_copy and the stored
// view points into that copy.
void input_format_conllu::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  this->text = string_piece(text_copy.c_str(), text_copy.size());
}
|
18425
|
|
|
|
|
|
|
|
|
18426
|
0
|
|
|
|
|
|
bool input_format_conllu::next_sentence(sentence& s, string& error) { |
|
18427
|
|
|
|
|
|
|
error.clear(); |
|
18428
|
0
|
|
|
|
|
|
s.clear(); |
|
18429
|
|
|
|
|
|
|
int last_multiword_token = 0; |
|
18430
|
|
|
|
|
|
|
|
|
18431
|
|
|
|
|
|
|
vector tokens, parts; |
|
18432
|
0
|
0
|
|
|
|
|
while (text.len) { |
|
18433
|
|
|
|
|
|
|
// Read line |
|
18434
|
0
|
|
|
|
|
|
string_piece line(text.str, 0); |
|
18435
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++; |
|
|
|
0
|
|
|
|
|
|
|
18436
|
|
|
|
|
|
|
|
|
18437
|
0
|
|
|
|
|
|
text.str += line.len, text.len -= line.len; |
|
18438
|
0
|
0
|
|
|
|
|
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18439
|
0
|
|
|
|
|
|
text.str += 2, text.len -= 2; |
|
18440
|
0
|
0
|
|
|
|
|
else if (text.len && *text.str == '\n') |
|
|
|
0
|
|
|
|
|
|
|
18441
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18442
|
|
|
|
|
|
|
|
|
18443
|
|
|
|
|
|
|
// Empty lines denote end of tree, unless at the beginning |
|
18444
|
0
|
0
|
|
|
|
|
if (!line.len) { |
|
18445
|
0
|
0
|
|
|
|
|
if (s.empty()) continue; |
|
18446
|
0
|
|
|
|
|
|
break; |
|
18447
|
|
|
|
|
|
|
} |
|
18448
|
|
|
|
|
|
|
|
|
18449
|
0
|
0
|
|
|
|
|
if (*line.str == '#') { |
|
18450
|
|
|
|
|
|
|
// Store comments at the beginning and ignore the rest |
|
18451
|
0
|
0
|
|
|
|
|
if (s.empty()) s.comments.emplace_back(line.str, line.len); |
|
|
|
0
|
|
|
|
|
|
|
18452
|
|
|
|
|
|
|
continue; |
|
18453
|
|
|
|
|
|
|
} |
|
18454
|
|
|
|
|
|
|
|
|
18455
|
|
|
|
|
|
|
// Parse the line |
|
18456
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
|
18457
|
0
|
0
|
|
|
|
|
if (tokens.size() != 10) |
|
18458
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18459
|
|
|
|
|
|
|
|
|
18460
|
|
|
|
|
|
|
// Check that no column is empty and contains no spaces (except FORM, LEMMA and MISC in version >= 2) |
|
18461
|
0
|
0
|
|
|
|
|
for (int i = 0; i < 10; i++) { |
|
18462
|
0
|
0
|
|
|
|
|
if (!tokens[i].len) |
|
18463
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18464
|
0
|
0
|
|
|
|
|
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18465
|
0
|
0
|
|
|
|
|
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18466
|
|
|
|
|
|
|
} |
|
18467
|
|
|
|
|
|
|
|
|
18468
|
|
|
|
|
|
|
// Handle multiword tokens |
|
18469
|
0
|
0
|
|
|
|
|
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
|
18470
|
0
|
0
|
|
|
|
|
split(tokens[0], '-', parts); |
|
18471
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) |
|
18472
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18473
|
|
|
|
|
|
|
int from, to; |
|
18474
|
0
|
0
|
|
|
|
|
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18475
|
|
|
|
|
|
|
return false; |
|
18476
|
0
|
0
|
|
|
|
|
if (from != int(s.words.size())) |
|
18477
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18478
|
0
|
0
|
|
|
|
|
if (to < from) |
|
18479
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18480
|
0
|
0
|
|
|
|
|
if (from <= last_multiword_token) |
|
18481
|
0
|
0
|
|
|
|
|
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18482
|
|
|
|
|
|
|
last_multiword_token = to; |
|
18483
|
0
|
0
|
|
|
|
|
for (int i = 2; i < 9; i++) |
|
18484
|
0
|
0
|
|
|
|
|
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18485
|
0
|
0
|
|
|
|
|
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18486
|
0
|
0
|
|
|
|
|
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18487
|
0
|
|
|
|
|
|
continue; |
|
18488
|
|
|
|
|
|
|
} |
|
18489
|
|
|
|
|
|
|
|
|
18490
|
|
|
|
|
|
|
// Handle empty nodes |
|
18491
|
0
|
0
|
|
|
|
|
if (version >= 2) |
|
18492
|
0
|
0
|
|
|
|
|
if (memchr(tokens[0].str, '.', tokens[0].len)) { |
|
18493
|
0
|
0
|
|
|
|
|
split(tokens[0], '.', parts); |
|
18494
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) |
|
18495
|
0
|
0
|
|
|
|
|
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18496
|
|
|
|
|
|
|
int id, index; |
|
18497
|
0
|
0
|
|
|
|
|
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18498
|
|
|
|
|
|
|
return false; |
|
18499
|
0
|
0
|
|
|
|
|
if (id != int(s.words.size()) - 1) |
|
18500
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18501
|
0
|
0
|
|
|
|
|
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18502
|
0
|
0
|
|
|
|
|
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18503
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18504
|
0
|
0
|
|
|
|
|
for (int i = 6; i < 8; i++) |
|
18505
|
0
|
0
|
|
|
|
|
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18506
|
0
|
0
|
|
|
|
|
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18507
|
|
|
|
|
|
|
|
|
18508
|
0
|
0
|
|
|
|
|
s.empty_nodes.emplace_back(id, index); |
|
18509
|
0
|
|
|
|
|
|
s.empty_nodes.back().form.assign(tokens[1].str, tokens[1].len); |
|
18510
|
0
|
|
|
|
|
|
s.empty_nodes.back().lemma.assign(tokens[2].str, tokens[2].len); |
|
18511
|
0
|
0
|
|
|
|
|
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18512
|
0
|
0
|
|
|
|
|
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18513
|
0
|
0
|
|
|
|
|
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18514
|
0
|
0
|
|
|
|
|
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18515
|
0
|
0
|
|
|
|
|
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18516
|
0
|
|
|
|
|
|
continue; |
|
18517
|
|
|
|
|
|
|
} |
|
18518
|
|
|
|
|
|
|
|
|
18519
|
|
|
|
|
|
|
// Parse word ID and head |
|
18520
|
|
|
|
|
|
|
int id; |
|
18521
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
|
|
0
|
|
|
|
|
|
|
18522
|
|
|
|
|
|
|
return false; |
|
18523
|
0
|
0
|
|
|
|
|
if (id != int(s.words.size())) |
|
18524
|
0
|
0
|
|
|
|
|
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18525
|
|
|
|
|
|
|
|
|
18526
|
|
|
|
|
|
|
int head; |
|
18527
|
0
|
0
|
|
|
|
|
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18528
|
0
|
|
|
|
|
|
head = -1; |
|
18529
|
|
|
|
|
|
|
} else { |
|
18530
|
0
|
0
|
|
|
|
|
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
|
|
0
|
|
|
|
|
|
|
18531
|
|
|
|
|
|
|
return false; |
|
18532
|
0
|
0
|
|
|
|
|
if (head < 0) |
|
18533
|
0
|
0
|
|
|
|
|
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18534
|
|
|
|
|
|
|
} |
|
18535
|
|
|
|
|
|
|
|
|
18536
|
|
|
|
|
|
|
// Add new word |
|
18537
|
|
|
|
|
|
|
auto& word = s.add_word(tokens[1]); |
|
18538
|
0
|
|
|
|
|
|
word.lemma.assign(tokens[2].str, tokens[2].len); |
|
18539
|
0
|
0
|
|
|
|
|
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18540
|
0
|
0
|
|
|
|
|
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18541
|
0
|
0
|
|
|
|
|
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18542
|
0
|
|
|
|
|
|
word.head = head; |
|
18543
|
0
|
0
|
|
|
|
|
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18544
|
0
|
0
|
|
|
|
|
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18545
|
0
|
0
|
|
|
|
|
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18546
|
|
|
|
|
|
|
} |
|
18547
|
|
|
|
|
|
|
|
|
18548
|
|
|
|
|
|
|
// Check that we got word for the last multiword token |
|
18549
|
0
|
0
|
|
|
|
|
if (last_multiword_token >= int(s.words.size())) |
|
18550
|
0
|
0
|
|
|
|
|
return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
18551
|
|
|
|
|
|
|
|
|
18552
|
|
|
|
|
|
|
// Set heads correctly |
|
18553
|
0
|
0
|
|
|
|
|
for (auto&& word : s.words) |
|
18554
|
0
|
0
|
|
|
|
|
if (word.id && word.head >= 0) { |
|
|
|
0
|
|
|
|
|
|
|
18555
|
0
|
0
|
|
|
|
|
if (word.head >= int(s.words.size())) |
|
18556
|
0
|
0
|
|
|
|
|
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18557
|
0
|
0
|
|
|
|
|
s.set_head(word.id, word.head, word.deprel); |
|
18558
|
|
|
|
|
|
|
} |
|
18559
|
|
|
|
|
|
|
|
|
18560
|
0
|
|
|
|
|
|
return !s.empty(); |
|
18561
|
|
|
|
|
|
|
} |
|
18562
|
|
|
|
|
|
|
|
|
18563
|
|
|
|
|
|
|
// Horizontal input format |
|
18564
|
0
|
|
|
|
|
|
class input_format_horizontal : public input_format { |
|
18565
|
|
|
|
|
|
|
public: |
|
18566
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
|
18567
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) override; |
|
18568
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
18569
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
|
18570
|
|
|
|
|
|
|
|
|
18571
|
|
|
|
|
|
|
private: |
|
18572
|
|
|
|
|
|
|
string_piece text; |
|
18573
|
|
|
|
|
|
|
string text_copy; |
|
18574
|
|
|
|
|
|
|
bool new_document = true; |
|
18575
|
|
|
|
|
|
|
string document_id; |
|
18576
|
|
|
|
|
|
|
unsigned preceeding_newlines = 2; |
|
18577
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
|
18578
|
|
|
|
|
|
|
}; |
|
18579
|
|
|
|
|
|
|
|
|
18580
|
0
|
|
|
|
|
|
// Reads a single line from `is` into `block` and re-appends the '\n' removed
// by getline. Returns false when no line could be read.
bool input_format_horizontal::read_block(istream& is, string& block) const {
  if (!getline(is, block))
    return false;

  block.push_back('\n');
  return true;
}
|
18585
|
|
|
|
|
|
|
|
|
18586
|
0
|
|
|
|
|
|
// Starts a new document identified by `id`: remembers the id, restarts the
// sentence numbering, makes the next sentence open both a new document and a
// new paragraph, and drops any pending text.
void input_format_horizontal::reset_document(string_piece id) {
  document_id.assign(id.str, id.len);
  new_document = true;
  sentence_id = 1;
  preceeding_newlines = 2;
  this->set_text("");
}
|
18593
|
|
|
|
|
|
|
|
|
18594
|
0
|
|
|
|
|
|
// Sets the text to be parsed by subsequent next_sentence calls.
// With make_copy == false only the view is stored (the characters are not
// copied, so the caller presumably has to keep them alive); with
// make_copy == true the characters are copied into text_copy and the stored
// view points into that copy.
void input_format_horizontal::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  this->text = string_piece(text_copy.c_str(), text_copy.size());
}
|
18601
|
|
|
|
|
|
|
|
|
18602
|
0
|
|
|
|
|
|
bool input_format_horizontal::next_sentence(sentence& s, string& error) { |
|
18603
|
|
|
|
|
|
|
error.clear(); |
|
18604
|
0
|
|
|
|
|
|
s.clear(); |
|
18605
|
|
|
|
|
|
|
|
|
18606
|
|
|
|
|
|
|
// Skip spaces and newlines |
|
18607
|
0
|
0
|
|
|
|
|
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18608
|
0
|
|
|
|
|
|
preceeding_newlines += *text.str == '\n'; |
|
18609
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18610
|
|
|
|
|
|
|
} |
|
18611
|
|
|
|
|
|
|
|
|
18612
|
|
|
|
|
|
|
// Read space (and tab) separated words |
|
18613
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18614
|
|
|
|
|
|
|
string_piece word = text; |
|
18615
|
|
|
|
|
|
|
|
|
18616
|
|
|
|
|
|
|
// Slurp the word |
|
18617
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18618
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18619
|
0
|
|
|
|
|
|
word.len = text.str - word.str; |
|
18620
|
|
|
|
|
|
|
s.add_word(word); |
|
18621
|
|
|
|
|
|
|
|
|
18622
|
|
|
|
|
|
|
// Replace s by regular spaces |
|
18623
|
0
|
0
|
|
|
|
|
if (s.words.back().form.find("\302\240") != string::npos) { |
|
18624
|
0
|
|
|
|
|
|
string& form = s.words.back().form; |
|
18625
|
|
|
|
|
|
|
size_t form_len = 0; |
|
18626
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < form.size(); i++) { |
|
18627
|
0
|
0
|
|
|
|
|
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18628
|
0
|
|
|
|
|
|
form[form_len - 1] = ' '; |
|
18629
|
|
|
|
|
|
|
else |
|
18630
|
0
|
|
|
|
|
|
form[form_len++] = form[i]; |
|
18631
|
|
|
|
|
|
|
} |
|
18632
|
|
|
|
|
|
|
form.resize(form_len); |
|
18633
|
|
|
|
|
|
|
} |
|
18634
|
|
|
|
|
|
|
|
|
18635
|
|
|
|
|
|
|
// Skip spaces |
|
18636
|
0
|
0
|
|
|
|
|
while (text.len && (*text.str == ' ' || *text.str == '\t')) |
|
|
|
0
|
|
|
|
|
|
|
18637
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18638
|
|
|
|
|
|
|
} |
|
18639
|
|
|
|
|
|
|
|
|
18640
|
0
|
0
|
|
|
|
|
if (!s.empty()) { |
|
18641
|
|
|
|
|
|
|
// Mark new document if needed |
|
18642
|
0
|
0
|
|
|
|
|
if (new_document) |
|
18643
|
0
|
|
|
|
|
|
s.set_new_doc(true, document_id); |
|
18644
|
0
|
|
|
|
|
|
new_document = false; |
|
18645
|
|
|
|
|
|
|
|
|
18646
|
|
|
|
|
|
|
// Mark new paragraph if needed |
|
18647
|
0
|
0
|
|
|
|
|
if (preceeding_newlines >= 2) |
|
18648
|
0
|
|
|
|
|
|
s.set_new_par(true); |
|
18649
|
0
|
|
|
|
|
|
preceeding_newlines = 0; |
|
18650
|
|
|
|
|
|
|
|
|
18651
|
|
|
|
|
|
|
// Sentence id |
|
18652
|
0
|
0
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
|
18653
|
|
|
|
|
|
|
} |
|
18654
|
|
|
|
|
|
|
|
|
18655
|
0
|
|
|
|
|
|
return !s.empty(); |
|
18656
|
|
|
|
|
|
|
} |
|
18657
|
|
|
|
|
|
|
|
|
18658
|
|
|
|
|
|
|
// Vertical input format |
|
18659
|
0
|
|
|
|
|
|
class input_format_vertical : public input_format { |
|
18660
|
|
|
|
|
|
|
public: |
|
18661
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
|
18662
|
|
|
|
|
|
|
virtual void reset_document(string_piece id = string_piece()) override; |
|
18663
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
18664
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
|
18665
|
|
|
|
|
|
|
|
|
18666
|
|
|
|
|
|
|
private: |
|
18667
|
|
|
|
|
|
|
string_piece text; |
|
18668
|
|
|
|
|
|
|
string text_copy; |
|
18669
|
|
|
|
|
|
|
bool new_document = true; |
|
18670
|
|
|
|
|
|
|
string document_id; |
|
18671
|
|
|
|
|
|
|
unsigned preceeding_newlines = 2; |
|
18672
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
|
18673
|
|
|
|
|
|
|
}; |
|
18674
|
|
|
|
|
|
|
|
|
18675
|
0
|
|
|
|
|
|
// Reads one block of input from `is` into `block` using getpara and reports
// whether the read succeeded (the resulting stream state converted to bool).
bool input_format_vertical::read_block(istream& is, string& block) const {
  return static_cast<bool>(getpara(is, block));
}
|
18678
|
|
|
|
|
|
|
|
|
18679
|
0
|
|
|
|
|
|
// Starts a new document identified by `id`: remembers the id, restarts the
// sentence numbering, makes the next sentence open both a new document and a
// new paragraph, and drops any pending text.
void input_format_vertical::reset_document(string_piece id) {
  document_id.assign(id.str, id.len);
  new_document = true;
  sentence_id = 1;
  preceeding_newlines = 2;
  this->set_text("");
}
|
18686
|
|
|
|
|
|
|
|
|
18687
|
0
|
|
|
|
|
|
// Sets the text to be parsed by subsequent next_sentence calls.
// With make_copy == false only the view is stored (the characters are not
// copied, so the caller presumably has to keep them alive); with
// make_copy == true the characters are copied into text_copy and the stored
// view points into that copy.
void input_format_vertical::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  this->text = string_piece(text_copy.c_str(), text_copy.size());
}
|
18694
|
|
|
|
|
|
|
|
|
18695
|
0
|
|
|
|
|
|
bool input_format_vertical::next_sentence(sentence& s, string& error) { |
|
18696
|
|
|
|
|
|
|
error.clear(); |
|
18697
|
0
|
|
|
|
|
|
s.clear(); |
|
18698
|
|
|
|
|
|
|
|
|
18699
|
|
|
|
|
|
|
// Skip tabs and newlines |
|
18700
|
0
|
0
|
|
|
|
|
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18701
|
0
|
|
|
|
|
|
preceeding_newlines += *text.str == '\n'; |
|
18702
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18703
|
|
|
|
|
|
|
} |
|
18704
|
|
|
|
|
|
|
|
|
18705
|
|
|
|
|
|
|
// Read first word without tabs on every line |
|
18706
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18707
|
|
|
|
|
|
|
string_piece word = text; |
|
18708
|
|
|
|
|
|
|
|
|
18709
|
|
|
|
|
|
|
// Slurp the word |
|
18710
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18711
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18712
|
0
|
|
|
|
|
|
word.len = text.str - word.str; |
|
18713
|
|
|
|
|
|
|
s.add_word(word); |
|
18714
|
|
|
|
|
|
|
|
|
18715
|
|
|
|
|
|
|
// Skip spaces till end of line |
|
18716
|
0
|
0
|
|
|
|
|
while (text.len && *text.str != '\r' && *text.str != '\n') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18717
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18718
|
|
|
|
|
|
|
|
|
18719
|
|
|
|
|
|
|
// Skip one new line |
|
18720
|
0
|
0
|
|
|
|
|
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18721
|
0
|
|
|
|
|
|
text.str += 2, text.len -= 2; |
|
18722
|
0
|
0
|
|
|
|
|
else if (text.len && *text.str == '\n') |
|
|
|
0
|
|
|
|
|
|
|
18723
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18724
|
|
|
|
|
|
|
|
|
18725
|
|
|
|
|
|
|
// Skip tabs on the beginning of the line |
|
18726
|
0
|
0
|
|
|
|
|
while (text.len && *text.str == '\t') |
|
|
|
0
|
|
|
|
|
|
|
18727
|
0
|
|
|
|
|
|
text.str++, text.len--; |
|
18728
|
|
|
|
|
|
|
} |
|
18729
|
|
|
|
|
|
|
|
|
18730
|
0
|
0
|
|
|
|
|
if (!s.empty()) { |
|
18731
|
|
|
|
|
|
|
// Mark new document if needed |
|
18732
|
0
|
0
|
|
|
|
|
if (new_document) |
|
18733
|
0
|
|
|
|
|
|
s.set_new_doc(true, document_id); |
|
18734
|
0
|
|
|
|
|
|
new_document = false; |
|
18735
|
|
|
|
|
|
|
|
|
18736
|
|
|
|
|
|
|
// Mark new paragraph if needed |
|
18737
|
0
|
0
|
|
|
|
|
if (preceeding_newlines >= 2) |
|
18738
|
0
|
|
|
|
|
|
s.set_new_par(true); |
|
18739
|
0
|
|
|
|
|
|
preceeding_newlines = 0; |
|
18740
|
|
|
|
|
|
|
|
|
18741
|
|
|
|
|
|
|
// Sentence id |
|
18742
|
0
|
0
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
|
18743
|
|
|
|
|
|
|
} |
|
18744
|
|
|
|
|
|
|
|
|
18745
|
0
|
|
|
|
|
|
return !s.empty(); |
|
18746
|
|
|
|
|
|
|
} |
|
18747
|
|
|
|
|
|
|
|
|
18748
|
|
|
|
|
|
|
// Presegmented tokenizer |
|
18749
|
0
|
|
|
|
|
|
class input_format_presegmented_tokenizer : public input_format { |
|
18750
|
|
|
|
|
|
|
public: |
|
18751
|
0
|
|
|
|
|
|
input_format_presegmented_tokenizer(input_format* tokenizer) : tokenizer(tokenizer) {} |
|
18752
|
|
|
|
|
|
|
|
|
18753
|
|
|
|
|
|
|
virtual bool read_block(istream& is, string& block) const override; |
|
18754
|
|
|
|
|
|
|
virtual void reset_document(string_piece id) override; |
|
18755
|
|
|
|
|
|
|
virtual void set_text(string_piece text, bool make_copy = false) override; |
|
18756
|
|
|
|
|
|
|
virtual bool next_sentence(sentence& s, string& error) override; |
|
18757
|
|
|
|
|
|
|
|
|
18758
|
|
|
|
|
|
|
private: |
|
18759
|
|
|
|
|
|
|
unique_ptr tokenizer; |
|
18760
|
|
|
|
|
|
|
string_piece text; |
|
18761
|
|
|
|
|
|
|
string text_copy; |
|
18762
|
|
|
|
|
|
|
bool new_document = true; |
|
18763
|
|
|
|
|
|
|
string document_id; |
|
18764
|
|
|
|
|
|
|
unsigned preceeding_newlines = 2; |
|
18765
|
|
|
|
|
|
|
unsigned sentence_id = 1; |
|
18766
|
|
|
|
|
|
|
}; |
|
18767
|
|
|
|
|
|
|
|
|
18768
|
0
|
|
|
|
|
|
// Reads a single line from `is` into `block` and re-appends the '\n' removed
// by getline. Returns false when no line could be read.
bool input_format_presegmented_tokenizer::read_block(istream& is, string& block) const {
  if (!getline(is, block))
    return false;

  block.push_back('\n');
  return true;
}
|
18773
|
|
|
|
|
|
|
|
|
18774
|
0
|
|
|
|
|
|
// Starts a new document identified by `id`: remembers the id, restarts the
// sentence numbering, makes the next sentence open both a new document and a
// new paragraph, resets the wrapped tokenizer (without an id) and drops any
// pending text.
void input_format_presegmented_tokenizer::reset_document(string_piece id) {
  document_id.assign(id.str, id.len);
  new_document = true;
  sentence_id = 1;
  preceeding_newlines = 2;

  tokenizer->reset_document();
  this->set_text("");
}
|
18782
|
|
|
|
|
|
|
|
|
18783
|
0
|
|
|
|
|
|
// Sets the text to be parsed by subsequent next_sentence calls.
// With make_copy == false only the view is stored (the characters are not
// copied, so the caller presumably has to keep them alive); with
// make_copy == true the characters are copied into text_copy and the stored
// view points into that copy.
void input_format_presegmented_tokenizer::set_text(string_piece text, bool make_copy) {
  if (!make_copy) {
    this->text = text;
    return;
  }

  text_copy.assign(text.str, text.len);
  this->text = string_piece(text_copy.c_str(), text_copy.size());
}
|
18790
|
|
|
|
|
|
|
|
|
18791
|
0
|
|
|
|
|
|
bool input_format_presegmented_tokenizer::next_sentence(sentence& s, string& error) { |
|
18792
|
|
|
|
|
|
|
error.clear(); |
|
18793
|
0
|
|
|
|
|
|
s.clear(); |
|
18794
|
|
|
|
|
|
|
|
|
18795
|
0
|
|
|
|
|
|
sentence partial; |
|
18796
|
|
|
|
|
|
|
unsigned following_newlines = 0; |
|
18797
|
0
|
0
|
|
|
|
|
while (text.len && s.empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18798
|
|
|
|
|
|
|
// Move next line from `text' to `line', including leading and following newlines |
|
18799
|
0
|
|
|
|
|
|
string_piece line(text.str, 0); |
|
18800
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
|
|
0
|
|
|
|
|
|
|
18801
|
0
|
|
|
|
|
|
preceeding_newlines += line.str[line.len] == '\n'; |
|
18802
|
0
|
|
|
|
|
|
line.len++; |
|
18803
|
|
|
|
|
|
|
} |
|
18804
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r')) |
|
|
|
0
|
|
|
|
|
|
|
18805
|
0
|
|
|
|
|
|
line.len++; |
|
18806
|
0
|
0
|
|
|
|
|
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
|
|
0
|
|
|
|
|
|
|
18807
|
0
|
|
|
|
|
|
following_newlines += line.str[line.len] == '\n'; |
|
18808
|
0
|
|
|
|
|
|
line.len++; |
|
18809
|
|
|
|
|
|
|
} |
|
18810
|
0
|
|
|
|
|
|
text.str += line.len, text.len -= line.len; |
|
18811
|
|
|
|
|
|
|
|
|
18812
|
|
|
|
|
|
|
// Add all tokens from the line to `s' |
|
18813
|
0
|
0
|
|
|
|
|
tokenizer->set_text(line, false); |
|
18814
|
0
|
0
|
|
|
|
|
while (tokenizer->next_sentence(partial, error)) { |
|
|
|
0
|
|
|
|
|
|
|
18815
|
|
|
|
|
|
|
// Append words |
|
18816
|
0
|
|
|
|
|
|
size_t words = s.words.size() - 1; |
|
18817
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < partial.words.size(); i++) { |
|
18818
|
0
|
|
|
|
|
|
s.words.push_back(move(partial.words[i])); |
|
18819
|
0
|
|
|
|
|
|
s.words.back().id += words; |
|
18820
|
0
|
0
|
|
|
|
|
if (s.words.back().head > 0) s.words.back().head += words; |
|
18821
|
|
|
|
|
|
|
} |
|
18822
|
|
|
|
|
|
|
|
|
18823
|
|
|
|
|
|
|
// Append multiword_tokens |
|
18824
|
0
|
0
|
|
|
|
|
for (auto&& multiword_token : partial.multiword_tokens) { |
|
18825
|
0
|
|
|
|
|
|
s.multiword_tokens.push_back(move(multiword_token)); |
|
18826
|
0
|
|
|
|
|
|
s.multiword_tokens.back().id_first += words; |
|
18827
|
0
|
|
|
|
|
|
s.multiword_tokens.back().id_last += words; |
|
18828
|
|
|
|
|
|
|
} |
|
18829
|
|
|
|
|
|
|
|
|
18830
|
|
|
|
|
|
|
// Append empty nodes |
|
18831
|
0
|
0
|
|
|
|
|
for (auto&& empty_node : partial.empty_nodes) { |
|
18832
|
0
|
|
|
|
|
|
s.empty_nodes.push_back(move(empty_node)); |
|
18833
|
0
|
|
|
|
|
|
s.empty_nodes.back().id += words; |
|
18834
|
|
|
|
|
|
|
} |
|
18835
|
|
|
|
|
|
|
} |
|
18836
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
|
18837
|
|
|
|
|
|
|
|
|
18838
|
0
|
0
|
|
|
|
|
if (s.empty()) { |
|
18839
|
0
|
|
|
|
|
|
preceeding_newlines += following_newlines; |
|
18840
|
|
|
|
|
|
|
following_newlines = 0; |
|
18841
|
|
|
|
|
|
|
} |
|
18842
|
|
|
|
|
|
|
} |
|
18843
|
|
|
|
|
|
|
|
|
18844
|
0
|
0
|
|
|
|
|
if (!s.empty()) { |
|
18845
|
|
|
|
|
|
|
// Mark new document if needed |
|
18846
|
0
|
0
|
|
|
|
|
if (new_document) |
|
18847
|
0
|
0
|
|
|
|
|
s.set_new_doc(true, document_id); |
|
18848
|
0
|
|
|
|
|
|
new_document = false; |
|
18849
|
|
|
|
|
|
|
|
|
18850
|
|
|
|
|
|
|
// Mark new paragraph if needed |
|
18851
|
0
|
0
|
|
|
|
|
if (preceeding_newlines >= 2) |
|
18852
|
0
|
0
|
|
|
|
|
s.set_new_par(true); |
|
18853
|
0
|
|
|
|
|
|
preceeding_newlines = following_newlines; |
|
18854
|
|
|
|
|
|
|
|
|
18855
|
|
|
|
|
|
|
// Sentence id |
|
18856
|
0
|
0
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
|
18857
|
|
|
|
|
|
|
|
|
18858
|
|
|
|
|
|
|
// Fill "# text" comment |
|
18859
|
0
|
0
|
|
|
|
|
s.comments.emplace_back("# text = "); |
|
18860
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
18861
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
|
|
0
|
|
|
|
|
|
|
18862
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18863
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
|
18864
|
|
|
|
|
|
|
|
|
18865
|
|
|
|
|
|
|
s.comments.back().append(tok.form); |
|
18866
|
0
|
0
|
|
|
|
|
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
18867
|
|
|
|
|
|
|
} |
|
18868
|
|
|
|
|
|
|
} |
|
18869
|
|
|
|
|
|
|
|
|
18870
|
0
|
|
|
|
|
|
return !s.empty(); |
|
18871
|
|
|
|
|
|
|
} |
|
18872
|
|
|
|
|
|
|
|
|
18873
|
|
|
|
|
|
|
// Static factory methods |
|
18874
|
0
|
|
|
|
|
|
// Create a CoNLL-U input format configured by `options'.
// The CONLLU_V1 option selects version 1; otherwise (or when CONLLU_V2 is
// also given) version 2 is used. Returns nullptr if `options' cannot be parsed.
input_format* input_format::new_conllu_input_format(const string& options) {
  named_values::map parsed_options;
  string parse_error;
  const bool parsed = named_values::parse(options, parsed_options, parse_error);
  if (!parsed)
    return nullptr;

  // An explicit v2 request overrides v1; v2 is also the default.
  const bool want_v1 = parsed_options.count(CONLLU_V1) != 0;
  const bool want_v2 = parsed_options.count(CONLLU_V2) != 0;
  const unsigned version = (want_v1 && !want_v2) ? 1 : 2;

  return new input_format_conllu(version);
}
|
18888
|
|
|
|
|
|
|
|
|
18889
|
0
|
|
|
|
|
|
// Create an input format backed by the generic MorphoDiTa tokenizer.
// Recognized options: GENERIC_TOKENIZER_NORMALIZED_SPACES,
// GENERIC_TOKENIZER_RANGES and GENERIC_TOKENIZER_PRESEGMENTED (the last one
// wraps the tokenizer in a presegmented tokenizer).
// Returns nullptr if `options' cannot be parsed.
input_format* input_format::new_generic_tokenizer_input_format(const string& options) {
  named_values::map parsed_options;
  string parse_error;
  const bool parsed = named_values::parse(options, parsed_options, parse_error);
  if (!parsed)
    return nullptr;

  const bool normalized_spaces = parsed_options.count(GENERIC_TOKENIZER_NORMALIZED_SPACES) != 0;
  const bool token_ranges = parsed_options.count(GENERIC_TOKENIZER_RANGES) != 0;

  input_format* result = new morphodita_tokenizer_wrapper(
      morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges);

  if (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result)
    return input_format::new_presegmented_tokenizer(result);
  return result;
}
|
18901
|
|
|
|
|
|
|
|
|
18902
|
0
|
|
|
|
|
|
// Create a horizontal input format; the options string is not used.
input_format* input_format::new_horizontal_input_format(const string& /*options*/) {
  input_format* format = new input_format_horizontal();
  return format;
}
|
18905
|
|
|
|
|
|
|
|
|
18906
|
0
|
|
|
|
|
|
// Create a vertical input format; the options string is not used.
input_format* input_format::new_vertical_input_format(const string& /*options*/) {
  input_format* format = new input_format_vertical();
  return format;
}
|
18909
|
|
|
|
|
|
|
|
|
18910
|
0
|
|
|
|
|
|
// Create an input format from a "name" or "name=options" specification.
// The part before the first '=' selects the format, the part after it is
// passed to the corresponding factory as its options string.
// Returns nullptr for an unknown format name.
input_format* input_format::new_input_format(const string& name) {
  const size_t equal = name.find('=');
  const bool has_options = equal != string::npos;
  const size_t name_len = has_options ? equal : name.size();
  const size_t option_offset = has_options ? equal + 1 : name.size();

  // True when the format-name part of `name' is exactly `format'.
  auto is_format = [&name, name_len](const char* format) {
    return name.compare(0, name_len, format) == 0;
  };

  if (is_format("conllu"))
    return new_conllu_input_format(name.substr(option_offset));
  if (is_format("generic_tokenizer"))
    return new_generic_tokenizer_input_format(name.substr(option_offset));
  if (is_format("horizontal"))
    return new_horizontal_input_format(name.substr(option_offset));
  if (is_format("vertical"))
    return new_vertical_input_format(name.substr(option_offset));

  return nullptr;
}
|
18921
|
|
|
|
|
|
|
|
|
18922
|
0
|
|
|
|
|
|
// Wrap `tokenizer' in an input format which treats every input line as one
// already segmented sentence.
input_format* input_format::new_presegmented_tokenizer(input_format* tokenizer) {
  input_format* wrapped = new input_format_presegmented_tokenizer(tokenizer);
  return wrapped;
}
|
18925
|
|
|
|
|
|
|
|
|
18926
|
|
|
|
|
|
|
///////// |
|
18927
|
|
|
|
|
|
|
// File: utils/xml_encoded.h |
|
18928
|
|
|
|
|
|
|
///////// |
|
18929
|
|
|
|
|
|
|
|
|
18930
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
18931
|
|
|
|
|
|
|
// |
|
18932
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18933
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18934
|
|
|
|
|
|
|
// |
|
18935
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18936
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18937
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18938
|
|
|
|
|
|
|
|
|
18939
|
|
|
|
|
|
|
namespace utils { |
|
18940
|
|
|
|
|
|
|
|
|
18941
|
|
|
|
|
|
|
// |
|
18942
|
|
|
|
|
|
|
// Declarations |
|
18943
|
|
|
|
|
|
|
// |
|
18944
|
|
|
|
|
|
|
|
|
18945
|
|
|
|
|
|
|
// Print xml content while encoding <>& and optionally " using XML entities. |
|
18946
|
|
|
|
|
|
|
class xml_encoded { |
|
18947
|
|
|
|
|
|
|
public: |
|
18948
|
0
|
|
|
|
|
|
xml_encoded(string_piece str, bool encode_quot = false) : str(str), encode_quot(encode_quot) {} |
|
18949
|
|
|
|
|
|
|
|
|
18950
|
|
|
|
|
|
|
friend ostream& operator<<(ostream& os, xml_encoded data); |
|
18951
|
|
|
|
|
|
|
private: |
|
18952
|
|
|
|
|
|
|
string_piece str; |
|
18953
|
|
|
|
|
|
|
bool encode_quot; |
|
18954
|
|
|
|
|
|
|
}; |
|
18955
|
|
|
|
|
|
|
|
|
18956
|
|
|
|
|
|
|
inline ostream& operator<<(ostream& os, xml_encoded data); |
|
18957
|
|
|
|
|
|
|
|
|
18958
|
|
|
|
|
|
|
// |
|
18959
|
|
|
|
|
|
|
// Definitions |
|
18960
|
|
|
|
|
|
|
// |
|
18961
|
|
|
|
|
|
|
|
|
18962
|
0
|
|
|
|
|
|
// Write `data' to `os', replacing '<', '>' and '&' (and '"' when
// data.encode_quot is set) with the corresponding XML entities. Runs of
// characters which need no encoding are written with a single os.write call.
ostream& operator<<(ostream& os, xml_encoded data) {
  string_piece& str = data.str;
  const char* to_print = str.str;  // start of the not-yet-written plain run

  while (str.len) {
    // Skip over characters which can be copied verbatim.
    while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"'))
      str.str++, str.len--;

    if (str.len) {
      // Flush the plain run, then emit the entity for the special character.
      // The last alternative can only be '"', because the scan above stops at
      // '"' only when encode_quot is set.
      if (to_print < str.str) os.write(to_print, str.str - to_print);
      os << (*str.str == '<' ? "&lt;" : *str.str == '>' ? "&gt;" : *str.str == '&' ? "&amp;" : "&quot;");
      str.str++, str.len--;
      to_print = str.str;
    }
  }

  // Flush the trailing plain run, if any.
  if (to_print < str.str) os.write(to_print, str.str - to_print);

  return os;
}
|
18982
|
|
|
|
|
|
|
|
|
18983
|
|
|
|
|
|
|
} // namespace utils |
|
18984
|
|
|
|
|
|
|
|
|
18985
|
|
|
|
|
|
|
///////// |
|
18986
|
|
|
|
|
|
|
// File: sentence/output_format.cpp |
|
18987
|
|
|
|
|
|
|
///////// |
|
18988
|
|
|
|
|
|
|
|
|
18989
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
18990
|
|
|
|
|
|
|
// |
|
18991
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
18992
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
18993
|
|
|
|
|
|
|
// |
|
18994
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
18995
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
18996
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
18997
|
|
|
|
|
|
|
|
|
18998
|
2
|
|
|
|
|
|
// Names of the options recognized by the output format factories.
const string output_format::CONLLU_V1("v1");
const string output_format::CONLLU_V2("v2");
const string output_format::HORIZONTAL_PARAGRAPHS("paragraphs");
const string output_format::PLAINTEXT_NORMALIZED_SPACES("normalized_spaces");
const string output_format::VERTICAL_PARAGRAPHS("paragraphs");
|
19003
|
|
|
|
|
|
|
|
|
19004
|
|
|
|
|
|
|
// CoNLL-U output format |
|
19005
|
2
|
|
|
|
|
|
class output_format_conllu : public output_format { |
|
19006
|
|
|
|
|
|
|
public: |
|
19007
|
1
|
|
|
|
|
|
output_format_conllu(unsigned version) : version(version) {} |
|
19008
|
|
|
|
|
|
|
|
|
19009
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
|
19010
|
|
|
|
|
|
|
|
|
19011
|
|
|
|
|
|
|
private: |
|
19012
|
|
|
|
|
|
|
unsigned version; |
|
19013
|
|
|
|
|
|
|
static const string underscore; |
|
19014
|
14
|
0
|
|
|
|
|
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19015
|
|
|
|
|
|
|
ostream& write_with_spaces(ostream& os, const string& str); |
|
19016
|
|
|
|
|
|
|
}; |
|
19017
|
|
|
|
|
|
|
|
|
19018
|
2
|
|
|
|
|
|
const string output_format_conllu::underscore = "_"; |
|
19019
|
|
|
|
|
|
|
|
|
19020
|
2
|
|
|
|
|
|
void output_format_conllu::write_sentence(const sentence& s, ostream& os) { |
|
19021
|
|
|
|
|
|
|
// Comments |
|
19022
|
5
|
100
|
|
|
|
|
for (auto&& comment : s.comments) |
|
19023
|
|
|
|
|
|
|
os << comment << '\n'; |
|
19024
|
|
|
|
|
|
|
|
|
19025
|
|
|
|
|
|
|
// Words and multiword tokens |
|
19026
|
|
|
|
|
|
|
size_t multiword_token = 0, empty_node = 0; |
|
19027
|
9
|
100
|
|
|
|
|
for (int i = 0; i < int(s.words.size()); i++) { |
|
19028
|
|
|
|
|
|
|
// Write non-root nodes |
|
19029
|
8
|
100
|
|
|
|
|
if (i > 0) { |
|
19030
|
|
|
|
|
|
|
// Multiword token if present |
|
19031
|
7
|
50
|
|
|
|
|
if (multiword_token < s.multiword_tokens.size() && |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
19032
|
0
|
|
|
|
|
|
i == s.multiword_tokens[multiword_token].id_first) { |
|
19033
|
0
|
|
|
|
|
|
os << s.multiword_tokens[multiword_token].id_first << '-' |
|
19034
|
0
|
|
|
|
|
|
<< s.multiword_tokens[multiword_token].id_last << '\t'; |
|
19035
|
0
|
|
|
|
|
|
write_with_spaces(os, s.multiword_tokens[multiword_token].form) << "\t_\t_\t_\t_\t_\t_\t_\t" |
|
19036
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.multiword_tokens[multiword_token].misc) << '\n'; |
|
19037
|
0
|
|
|
|
|
|
multiword_token++; |
|
19038
|
|
|
|
|
|
|
} |
|
19039
|
|
|
|
|
|
|
|
|
19040
|
|
|
|
|
|
|
// Write the word |
|
19041
|
7
|
|
|
|
|
|
os << i << '\t'; |
|
19042
|
7
|
|
|
|
|
|
write_with_spaces(os, s.words[i].form) << '\t'; |
|
19043
|
7
|
|
|
|
|
|
write_with_spaces(os, underscore_on_empty(s.words[i].lemma)) << '\t' |
|
19044
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].upostag) << '\t' |
|
19045
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].xpostag) << '\t' |
|
19046
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].feats) << '\t'; |
|
19047
|
7
|
50
|
|
|
|
|
if (s.words[i].head < 0) os << '_'; else os << s.words[i].head; os << '\t' |
|
19048
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].deprel) << '\t' |
|
19049
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].deps) << '\t' |
|
19050
|
7
|
|
|
|
|
|
<< underscore_on_empty(s.words[i].misc) << '\n'; |
|
19051
|
|
|
|
|
|
|
} |
|
19052
|
|
|
|
|
|
|
|
|
19053
|
|
|
|
|
|
|
// Empty nodes |
|
19054
|
8
|
50
|
|
|
|
|
if (version >= 2) |
|
19055
|
8
|
50
|
|
|
|
|
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
19056
|
0
|
|
|
|
|
|
os << i << '.' << s.empty_nodes[empty_node].index << '\t' |
|
19057
|
|
|
|
|
|
|
<< s.empty_nodes[empty_node].form << '\t' |
|
19058
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].lemma) << '\t' |
|
19059
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].upostag) << '\t' |
|
19060
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].xpostag) << '\t' |
|
19061
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].feats) << '\t' |
|
19062
|
|
|
|
|
|
|
<< "_\t" |
|
19063
|
|
|
|
|
|
|
<< "_\t" |
|
19064
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].deps) << '\t' |
|
19065
|
0
|
|
|
|
|
|
<< underscore_on_empty(s.empty_nodes[empty_node].misc) << '\n'; |
|
19066
|
|
|
|
|
|
|
} |
|
19067
|
|
|
|
|
|
|
} |
|
19068
|
|
|
|
|
|
|
os << endl; |
|
19069
|
1
|
|
|
|
|
|
} |
|
19070
|
|
|
|
|
|
|
|
|
19071
|
14
|
|
|
|
|
|
// Write `str' to `os'. For version >= 2, or when `str' has no space, the
// string is written unchanged; otherwise every space is written as '_'.
ostream& output_format_conllu::write_with_spaces(ostream& os, const string& str) {
  const bool verbatim = version >= 2 || str.find(' ') == string::npos;
  if (verbatim) {
    os << str;
    return os;
  }

  for (auto&& chr : str) {
    if (chr == ' ')
      os << '_';
    else
      os << chr;
  }
  return os;
}
|
19080
|
|
|
|
|
|
|
|
|
19081
|
|
|
|
|
|
|
// EPE output format |
|
19082
|
0
|
|
|
|
|
|
class output_format_epe : public output_format { |
|
19083
|
|
|
|
|
|
|
public: |
|
19084
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
|
19085
|
|
|
|
|
|
|
virtual void finish_document(ostream& os) override; |
|
19086
|
|
|
|
|
|
|
|
|
19087
|
|
|
|
|
|
|
private: |
|
19088
|
0
|
|
|
|
|
|
class json_builder { |
|
19089
|
|
|
|
|
|
|
public: |
|
19090
|
0
|
|
|
|
|
|
json_builder& object() { comma(); json.push_back('{'); stack.push_back('}'); return *this; } |
|
19091
|
0
|
|
|
|
|
|
json_builder& array() { comma(); json.push_back('['); stack.push_back(']'); return *this; } |
|
19092
|
0
|
0
|
|
|
|
|
json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; } |
|
|
|
0
|
|
|
|
|
|
|
19093
|
0
|
|
|
|
|
|
json_builder& key(string_piece name) { comma(); string(name); json.push_back(':'); return *this; } |
|
19094
|
0
|
0
|
|
|
|
|
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19095
|
0
|
0
|
|
|
|
|
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19096
|
0
|
|
|
|
|
|
json_builder& value_true() { comma(); json.push_back('t'); json.push_back('r'); json.push_back('u'); json.push_back('e'); comma_needed=true; return *this; } |
|
19097
|
|
|
|
|
|
|
|
|
19098
|
|
|
|
|
|
|
string_piece current() const { return string_piece(json.data(), json.size()); } |
|
19099
|
0
|
|
|
|
|
|
void clear() { json.clear(); stack.clear(); comma_needed=false; } |
|
19100
|
|
|
|
|
|
|
|
|
19101
|
|
|
|
|
|
|
private: |
|
19102
|
0
|
|
|
|
|
|
void comma() { |
|
19103
|
0
|
0
|
|
|
|
|
if (comma_needed) { |
|
19104
|
0
|
|
|
|
|
|
json.push_back(','); |
|
19105
|
0
|
|
|
|
|
|
json.push_back(' '); |
|
19106
|
|
|
|
|
|
|
} |
|
19107
|
0
|
|
|
|
|
|
comma_needed = false; |
|
19108
|
0
|
|
|
|
|
|
} |
|
19109
|
0
|
|
|
|
|
|
void string(string_piece str) { |
|
19110
|
0
|
|
|
|
|
|
json.push_back('"'); |
|
19111
|
0
|
0
|
|
|
|
|
for (; str.len; str.str++, str.len--) |
|
19112
|
0
|
|
|
|
|
|
switch (*str.str) { |
|
19113
|
0
|
|
|
|
|
|
case '"': json.push_back('\\'); json.push_back('\"'); break; |
|
19114
|
0
|
|
|
|
|
|
case '\\': json.push_back('\\'); json.push_back('\\'); break; |
|
19115
|
0
|
|
|
|
|
|
case '\b': json.push_back('\\'); json.push_back('b'); break; |
|
19116
|
0
|
|
|
|
|
|
case '\f': json.push_back('\\'); json.push_back('f'); break; |
|
19117
|
0
|
|
|
|
|
|
case '\n': json.push_back('\\'); json.push_back('n'); break; |
|
19118
|
0
|
|
|
|
|
|
case '\r': json.push_back('\\'); json.push_back('r'); break; |
|
19119
|
0
|
|
|
|
|
|
case '\t': json.push_back('\\'); json.push_back('t'); break; |
|
19120
|
|
|
|
|
|
|
default: |
|
19121
|
0
|
0
|
|
|
|
|
if (((unsigned char)*str.str) < 32) { |
|
19122
|
0
|
|
|
|
|
|
json.push_back('u'); json.push_back('0'); json.push_back('0'); json.push_back('0' + (*str.str >> 4)); json.push_back("0123456789ABCDEF"[*str.str & 0xF]); |
|
19123
|
|
|
|
|
|
|
} else { |
|
19124
|
0
|
|
|
|
|
|
json.push_back(*str.str); |
|
19125
|
|
|
|
|
|
|
} |
|
19126
|
|
|
|
|
|
|
} |
|
19127
|
0
|
|
|
|
|
|
json.push_back('"'); |
|
19128
|
0
|
|
|
|
|
|
} |
|
19129
|
0
|
|
|
|
|
|
void number(size_t value) { |
|
19130
|
|
|
|
|
|
|
size_t start_size = json.size(); |
|
19131
|
0
|
0
|
|
|
|
|
for (; value || start_size == json.size(); value /= 10) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19132
|
0
|
|
|
|
|
|
json.push_back('0' + (value % 10)); |
|
19133
|
|
|
|
|
|
|
reverse(json.begin() + start_size, json.end()); |
|
19134
|
0
|
|
|
|
|
|
} |
|
19135
|
|
|
|
|
|
|
|
|
19136
|
|
|
|
|
|
|
std::vector json; |
|
19137
|
|
|
|
|
|
|
std::vector stack; |
|
19138
|
|
|
|
|
|
|
bool comma_needed = false; |
|
19139
|
|
|
|
|
|
|
} json; |
|
19140
|
|
|
|
|
|
|
|
|
19141
|
|
|
|
|
|
|
vector feats; |
|
19142
|
|
|
|
|
|
|
size_t sentences = 0; |
|
19143
|
|
|
|
|
|
|
}; |
|
19144
|
|
|
|
|
|
|
|
|
19145
|
0
|
|
|
|
|
|
void output_format_epe::write_sentence(const sentence& s, ostream& os) { |
|
19146
|
0
|
0
|
|
|
|
|
json.object().key("id").value(++sentences).key("nodes").array(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19147
|
|
|
|
|
|
|
|
|
19148
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
|
19149
|
0
|
0
|
|
|
|
|
json.object().key("id").value(i).key("form").value(s.words[i].form); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19150
|
|
|
|
|
|
|
|
|
19151
|
|
|
|
|
|
|
size_t start, end; |
|
19152
|
0
|
0
|
|
|
|
|
if (s.words[i].get_token_range(start, end)) |
|
19153
|
0
|
0
|
|
|
|
|
json.key("start").value(start).key("end").value(end); |
|
|
|
0
|
|
|
|
|
|
|
19154
|
0
|
0
|
|
|
|
|
if (s.words[i].head == 0) |
|
19155
|
0
|
|
|
|
|
|
json.key("top").value_true(); |
|
19156
|
|
|
|
|
|
|
|
|
19157
|
0
|
0
|
|
|
|
|
json.key("properties").object() |
|
|
|
0
|
|
|
|
|
|
|
19158
|
0
|
0
|
|
|
|
|
.key("lemma").value(s.words[i].lemma) |
|
19159
|
0
|
0
|
|
|
|
|
.key("upos").value(s.words[i].upostag) |
|
19160
|
0
|
0
|
|
|
|
|
.key("xpos").value(s.words[i].xpostag); |
|
19161
|
0
|
|
|
|
|
|
split(s.words[i].feats, '|', feats); |
|
19162
|
0
|
0
|
|
|
|
|
for (auto&& feat : feats) { |
|
19163
|
0
|
|
|
|
|
|
string_piece key(feat.str, 0); |
|
19164
|
0
|
0
|
|
|
|
|
while (key.len < feat.len && key.str[key.len] != '=') |
|
|
|
0
|
|
|
|
|
|
|
19165
|
0
|
|
|
|
|
|
key.len++; |
|
19166
|
0
|
0
|
|
|
|
|
if (key.len + 1 < feat.len) |
|
19167
|
0
|
0
|
|
|
|
|
json.key(key).value(string_piece(key.str + key.len + 1, feat.len - key.len - 1)); |
|
19168
|
|
|
|
|
|
|
} |
|
19169
|
0
|
|
|
|
|
|
json.close(); |
|
19170
|
|
|
|
|
|
|
|
|
19171
|
0
|
0
|
|
|
|
|
if (!s.words[i].children.empty()) { |
|
19172
|
0
|
|
|
|
|
|
json.key("edges").array(); |
|
19173
|
0
|
0
|
|
|
|
|
for (auto&& child : s.words[i].children) |
|
19174
|
0
|
0
|
|
|
|
|
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19175
|
0
|
|
|
|
|
|
json.close(); |
|
19176
|
|
|
|
|
|
|
} |
|
19177
|
|
|
|
|
|
|
|
|
19178
|
0
|
|
|
|
|
|
json.close(); |
|
19179
|
|
|
|
|
|
|
} |
|
19180
|
0
|
|
|
|
|
|
json.close().close(); |
|
19181
|
|
|
|
|
|
|
|
|
19182
|
|
|
|
|
|
|
string_piece current = json.current(); |
|
19183
|
0
|
|
|
|
|
|
os.write(current.str, current.len).put('\n'); |
|
19184
|
|
|
|
|
|
|
json.clear(); |
|
19185
|
0
|
|
|
|
|
|
} |
|
19186
|
|
|
|
|
|
|
|
|
19187
|
0
|
|
|
|
|
|
// End the current document: restart the sentence numbering. Nothing is written.
void output_format_epe::finish_document(ostream& /*os*/) {
  sentences = 0;
}
|
19190
|
|
|
|
|
|
|
|
|
19191
|
|
|
|
|
|
|
// Matxin output format |
|
19192
|
0
|
|
|
|
|
|
class output_format_matxin : public output_format { |
|
19193
|
|
|
|
|
|
|
public: |
|
19194
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
|
19195
|
|
|
|
|
|
|
virtual void finish_document(ostream& os) override; |
|
19196
|
|
|
|
|
|
|
|
|
19197
|
|
|
|
|
|
|
private: |
|
19198
|
|
|
|
|
|
|
void write_node(const sentence& s, int node, string& pad, ostream& os); |
|
19199
|
|
|
|
|
|
|
|
|
19200
|
|
|
|
|
|
|
int sentences = 0; |
|
19201
|
|
|
|
|
|
|
}; |
|
19202
|
|
|
|
|
|
|
|
|
19203
|
0
|
|
|
|
|
|
void output_format_matxin::write_sentence(const sentence& s, ostream& os) { |
|
19204
|
0
|
0
|
|
|
|
|
if (!sentences) { |
|
19205
|
0
|
|
|
|
|
|
os << ""; |
|
19206
|
|
|
|
|
|
|
} |
|
19207
|
0
|
|
|
|
|
|
os << "\n\n"; |
|
19208
|
|
|
|
|
|
|
|
|
19209
|
|
|
|
|
|
|
string pad; |
|
19210
|
0
|
0
|
|
|
|
|
for (auto&& node : s.words[0].children) |
|
19211
|
0
|
0
|
|
|
|
|
write_node(s, node, pad, os); |
|
19212
|
|
|
|
|
|
|
|
|
19213
|
|
|
|
|
|
|
os << "" << endl; |
|
19214
|
0
|
|
|
|
|
|
} |
|
19215
|
|
|
|
|
|
|
|
|
19216
|
0
|
|
|
|
|
|
// End the current document: write the closing tag and restart the sentence
// numbering.
// NOTE(review): the tag literal was restored after having been stripped as
// markup -- confirm it against the upstream UDPipe sources.
void output_format_matxin::finish_document(ostream& os) {
  os << "</corpus>\n";

  sentences = 0;
}
|
19221
|
|
|
|
|
|
|
|
|
19222
|
0
|
|
|
|
|
|
void output_format_matxin::write_node(const sentence& s, int node, string& pad, ostream& os) { |
|
19223
|
|
|
|
|
|
|
// |
|
19224
|
0
|
|
|
|
|
|
pad.push_back(' '); |
|
19225
|
|
|
|
|
|
|
|
|
19226
|
0
|
0
|
|
|
|
|
os << pad << "
|
|
|
|
0
|
|
|
|
|
|
|
19227
|
0
|
0
|
|
|
|
|
<< "\" form=\"" << xml_encoded(s.words[node].form, true) |
|
19228
|
0
|
0
|
|
|
|
|
<< "\" lem=\"" << xml_encoded(s.words[node].lemma, true) |
|
19229
|
0
|
0
|
|
|
|
|
<< "\" mi=\"" << xml_encoded(s.words[node].feats, true) |
|
19230
|
0
|
0
|
|
|
|
|
<< "\" si=\"" << xml_encoded(s.words[node].deprel, true) << '"'; |
|
19231
|
|
|
|
|
|
|
|
|
19232
|
0
|
0
|
|
|
|
|
if (s.words[node].children.empty()) { |
|
19233
|
0
|
|
|
|
|
|
os << "/>\n"; |
|
19234
|
|
|
|
|
|
|
} else { |
|
19235
|
0
|
|
|
|
|
|
os << ">\n"; |
|
19236
|
0
|
0
|
|
|
|
|
for (auto&& child : s.words[node].children) |
|
19237
|
0
|
|
|
|
|
|
write_node(s, child, pad, os); |
|
19238
|
0
|
|
|
|
|
|
os << pad << "\n"; |
|
19239
|
|
|
|
|
|
|
} |
|
19240
|
|
|
|
|
|
|
|
|
19241
|
|
|
|
|
|
|
pad.pop_back(); |
|
19242
|
0
|
|
|
|
|
|
} |
|
19243
|
|
|
|
|
|
|
|
|
19244
|
|
|
|
|
|
|
// Horizontal output format |
|
19245
|
0
|
|
|
|
|
|
class output_format_horizontal : public output_format { |
|
19246
|
|
|
|
|
|
|
public: |
|
19247
|
0
|
|
|
|
|
|
output_format_horizontal(bool paragraphs) : paragraphs(paragraphs), empty(true) {} |
|
19248
|
|
|
|
|
|
|
|
|
19249
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
|
19250
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) override { empty = true; } |
|
19251
|
|
|
|
|
|
|
|
|
19252
|
|
|
|
|
|
|
private: |
|
19253
|
|
|
|
|
|
|
bool paragraphs; |
|
19254
|
|
|
|
|
|
|
bool empty; |
|
19255
|
|
|
|
|
|
|
}; |
|
19256
|
|
|
|
|
|
|
|
|
19257
|
0
|
|
|
|
|
|
void output_format_horizontal::write_sentence(const sentence& s, ostream& os) { |
|
19258
|
0
|
0
|
|
|
|
|
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19259
|
|
|
|
|
|
|
os << '\n'; |
|
19260
|
0
|
|
|
|
|
|
empty = false; |
|
19261
|
|
|
|
|
|
|
|
|
19262
|
|
|
|
|
|
|
string line; |
|
19263
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) { |
|
19264
|
|
|
|
|
|
|
// Append word, but replace spaces by s |
|
19265
|
0
|
0
|
|
|
|
|
for (auto&& chr : s.words[i].form) |
|
19266
|
0
|
0
|
|
|
|
|
if (chr == ' ') |
|
19267
|
0
|
0
|
|
|
|
|
line.append("\302\240"); |
|
19268
|
|
|
|
|
|
|
else |
|
19269
|
0
|
0
|
|
|
|
|
line.push_back(chr); |
|
19270
|
|
|
|
|
|
|
|
|
19271
|
0
|
0
|
|
|
|
|
if (i+1 < s.words.size()) |
|
19272
|
0
|
0
|
|
|
|
|
line.push_back(' '); |
|
19273
|
|
|
|
|
|
|
} |
|
19274
|
|
|
|
|
|
|
os << line << endl; |
|
19275
|
0
|
|
|
|
|
|
} |
|
19276
|
|
|
|
|
|
|
|
|
19277
|
|
|
|
|
|
|
// Plaintext output format |
|
19278
|
0
|
|
|
|
|
|
class output_format_plaintext : public output_format { |
|
19279
|
|
|
|
|
|
|
public: |
|
19280
|
0
|
|
|
|
|
|
output_format_plaintext(bool normalized): normalized(normalized), empty(true) {} |
|
19281
|
|
|
|
|
|
|
|
|
19282
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
|
19283
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) override { empty = true; } |
|
19284
|
|
|
|
|
|
|
private: |
|
19285
|
|
|
|
|
|
|
bool normalized; |
|
19286
|
|
|
|
|
|
|
bool empty; |
|
19287
|
|
|
|
|
|
|
}; |
|
19288
|
|
|
|
|
|
|
|
|
19289
|
0
|
|
|
|
|
|
void output_format_plaintext::write_sentence(const sentence& s, ostream& os) { |
|
19290
|
0
|
0
|
|
|
|
|
if (normalized) { |
|
19291
|
0
|
0
|
|
|
|
|
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19292
|
|
|
|
|
|
|
os << '\n'; |
|
19293
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
19294
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
|
|
0
|
|
|
|
|
|
|
19295
|
|
|
|
|
|
|
os << tok.form; |
|
19296
|
0
|
0
|
|
|
|
|
if (i+1 < s.words.size() && tok.get_space_after()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19297
|
|
|
|
|
|
|
os << ' '; |
|
19298
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19299
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
|
19300
|
|
|
|
|
|
|
} |
|
19301
|
|
|
|
|
|
|
os << endl; |
|
19302
|
|
|
|
|
|
|
} else { |
|
19303
|
|
|
|
|
|
|
string spaces; |
|
19304
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
19305
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
|
|
0
|
|
|
|
|
|
|
19306
|
0
|
0
|
|
|
|
|
tok.get_spaces_before(spaces); os << spaces; |
|
19307
|
0
|
0
|
|
|
|
|
tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form); |
|
|
|
0
|
|
|
|
|
|
|
19308
|
0
|
0
|
|
|
|
|
tok.get_spaces_after(spaces); os << spaces; |
|
19309
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19310
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
|
19311
|
|
|
|
|
|
|
} |
|
19312
|
|
|
|
|
|
|
os << flush; |
|
19313
|
|
|
|
|
|
|
} |
|
19314
|
0
|
|
|
|
|
|
empty = false; |
|
19315
|
0
|
|
|
|
|
|
} |
|
19316
|
|
|
|
|
|
|
|
|
19317
|
|
|
|
|
|
|
// Vertical output format |
|
19318
|
0
|
|
|
|
|
|
class output_format_vertical : public output_format { |
|
19319
|
|
|
|
|
|
|
public: |
|
19320
|
0
|
|
|
|
|
|
output_format_vertical(bool paragraphs) : paragraphs(paragraphs), empty(true) {} |
|
19321
|
|
|
|
|
|
|
|
|
19322
|
|
|
|
|
|
|
virtual void write_sentence(const sentence& s, ostream& os) override; |
|
19323
|
0
|
|
|
|
|
|
virtual void finish_document(ostream& /*os*/) override { empty = true; } |
|
19324
|
|
|
|
|
|
|
|
|
19325
|
|
|
|
|
|
|
private: |
|
19326
|
|
|
|
|
|
|
bool paragraphs; |
|
19327
|
|
|
|
|
|
|
bool empty; |
|
19328
|
|
|
|
|
|
|
}; |
|
19329
|
|
|
|
|
|
|
|
|
19330
|
0
|
|
|
|
|
|
void output_format_vertical::write_sentence(const sentence& s, ostream& os) { |
|
19331
|
0
|
0
|
|
|
|
|
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19332
|
|
|
|
|
|
|
os << '\n'; |
|
19333
|
0
|
|
|
|
|
|
empty = false; |
|
19334
|
|
|
|
|
|
|
|
|
19335
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < s.words.size(); i++) |
|
19336
|
|
|
|
|
|
|
os << s.words[i].form << '\n'; |
|
19337
|
|
|
|
|
|
|
os << endl; |
|
19338
|
0
|
|
|
|
|
|
} |
|
19339
|
|
|
|
|
|
|
|
|
19340
|
|
|
|
|
|
|
// Static factory methods |
|
19341
|
1
|
|
|
|
|
|
// Create a CoNLL-U output format configured by `options'.
// The CONLLU_V1 option selects version 1; otherwise (or when CONLLU_V2 is
// also given) version 2 is used. Returns nullptr if `options' cannot be parsed.
output_format* output_format::new_conllu_output_format(const string& options) {
  named_values::map parsed_options;
  string parse_error;
  const bool parsed = named_values::parse(options, parsed_options, parse_error);
  if (!parsed)
    return nullptr;

  // An explicit v2 request overrides v1; v2 is also the default.
  const bool want_v1 = parsed_options.count(CONLLU_V1) != 0;
  const bool want_v2 = parsed_options.count(CONLLU_V2) != 0;
  const unsigned version = (want_v1 && !want_v2) ? 1 : 2;

  return new output_format_conllu(version);
}
|
19355
|
|
|
|
|
|
|
|
|
19356
|
0
|
|
|
|
|
|
// Creates an EPE writer; the options string is ignored.
output_format* output_format::new_epe_output_format(const string& /*options*/) {
  output_format* const format = new output_format_epe();
  return format;
}
|
19359
|
|
|
|
|
|
|
|
|
19360
|
0
|
|
|
|
|
|
// Creates a Matxin writer; the options string is ignored.
output_format* output_format::new_matxin_output_format(const string& /*options*/) {
  output_format* const format = new output_format_matxin();
  return format;
}
|
19363
|
|
|
|
|
|
|
|
|
19364
|
0
|
|
|
|
|
|
// Creates a horizontal-format writer. `options` is parsed by
// named_values::parse; nullptr is returned when parsing fails. The number of
// HORIZONTAL_PARAGRAPHS entries among the options is handed to the writer.
output_format* output_format::new_horizontal_output_format(const string& options) {
  named_values::map option_map;
  string error;
  const bool parsed = named_values::parse(options, option_map, error);
  if (!parsed) return nullptr;

  return new output_format_horizontal(option_map.count(HORIZONTAL_PARAGRAPHS));
}
|
19372
|
|
|
|
|
|
|
|
|
19373
|
0
|
|
|
|
|
|
// Creates a plain-text writer. `options` is parsed by named_values::parse;
// nullptr is returned when parsing fails. The number of
// PLAINTEXT_NORMALIZED_SPACES entries among the options is handed to the writer.
output_format* output_format::new_plaintext_output_format(const string& options) {
  named_values::map option_map;
  string error;
  const bool parsed = named_values::parse(options, option_map, error);
  if (!parsed) return nullptr;

  return new output_format_plaintext(option_map.count(PLAINTEXT_NORMALIZED_SPACES));
}
|
19381
|
|
|
|
|
|
|
|
|
19382
|
0
|
|
|
|
|
|
// Creates a vertical-format writer. `options` is parsed by
// named_values::parse; nullptr is returned when parsing fails. Paragraph
// marking is enabled iff the VERTICAL_PARAGRAPHS option is present.
output_format* output_format::new_vertical_output_format(const string& options) {
  named_values::map option_map;
  string error;
  const bool parsed = named_values::parse(options, option_map, error);
  if (!parsed) return nullptr;

  const bool mark_paragraphs = option_map.count(VERTICAL_PARAGRAPHS) != 0;
  return new output_format_vertical(mark_paragraphs);
}
|
19390
|
|
|
|
|
|
|
|
|
19391
|
1
|
|
|
|
|
|
// Creates a writer from a "format" or "format=options" specification.
// Everything after the first '=' is passed to the format-specific factory as
// its options string. Returns nullptr for an unknown format name (or when the
// selected factory itself returns nullptr).
output_format* output_format::new_output_format(const string& name) {
  const size_t separator = name.find('=');
  const bool has_options = separator != string::npos;

  // Split the specification into the format name and its (possibly empty) options.
  const string format = has_options ? name.substr(0, separator) : name;
  const string options = has_options ? name.substr(separator + 1) : string();

  if (format == "conllu") return new_conllu_output_format(options);
  if (format == "epe") return new_epe_output_format(options);
  if (format == "matxin") return new_matxin_output_format(options);
  if (format == "horizontal") return new_horizontal_output_format(options);
  if (format == "plaintext") return new_plaintext_output_format(options);
  if (format == "vertical") return new_vertical_output_format(options);

  return nullptr;
}
|
19404
|
|
|
|
|
|
|
|
|
19405
|
|
|
|
|
|
|
///////// |
|
19406
|
|
|
|
|
|
|
// File: sentence/sentence.cpp |
|
19407
|
|
|
|
|
|
|
///////// |
|
19408
|
|
|
|
|
|
|
|
|
19409
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
19410
|
|
|
|
|
|
|
// |
|
19411
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
19412
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
19413
|
|
|
|
|
|
|
// |
|
19414
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
19415
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
19416
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
19417
|
|
|
|
|
|
|
|
|
19418
|
2
|
|
|
|
|
|
// Text used by sentence::clear() as the form, lemma, upostag, xpostag and
// feats of the technical root word stored at index 0.
// NOTE(review): this listing showed an empty literal here, but it has lost
// every angle-bracketed token (include names, template arguments), so the
// value is restored to "<root>" -- confirm against the pristine source.
const string sentence::root_form = "<root>";
|
19419
|
|
|
|
|
|
|
|
|
19420
|
1
|
|
|
|
|
|
// Builds a sentence in its cleared state (see clear()).
sentence::sentence() {
  this->clear();
}
|
19423
|
|
|
|
|
|
|
|
|
19424
|
0
|
|
|
|
|
|
bool sentence::empty() { |
|
19425
|
0
|
|
|
|
|
|
return words.size() == 1; |
|
19426
|
|
|
|
|
|
|
} |
|
19427
|
|
|
|
|
|
|
|
|
19428
|
3
|
|
|
|
|
|
// Drops all words, multiword tokens, empty nodes and comments, then
// re-creates the root word whose form, lemma, upostag, xpostag and feats are
// all set to root_form.
void sentence::clear() {
  comments.clear();
  empty_nodes.clear();
  multiword_tokens.clear();
  words.clear();

  word& root = add_word(root_form);
  root.feats = root_form;
  root.xpostag = root_form;
  root.upostag = root_form;
  root.lemma = root_form;
}
|
19437
|
|
|
|
|
|
|
|
|
19438
|
0
|
|
|
|
|
|
// Appends a new word with the given form; its id is the index it occupies
// in `words`. Returns a reference to the appended word.
word& sentence::add_word(string_piece form) {
  const int new_id = static_cast<int>(words.size());
  words.emplace_back(new_id, form);
  return words.back();
}
|
19442
|
|
|
|
|
|
|
|
|
19443
|
7
|
|
|
|
|
|
void sentence::set_head(int id, int head, const string& deprel) { |
|
19444
|
7
|
50
|
|
|
|
|
assert(id >= 0 && id < int(words.size())); |
|
|
|
50
|
|
|
|
|
|
|
19445
|
7
|
50
|
|
|
|
|
assert(head < int(words.size())); |
|
19446
|
|
|
|
|
|
|
|
|
19447
|
|
|
|
|
|
|
// Remove existing head |
|
19448
|
7
|
50
|
|
|
|
|
if (words[id].head >= 0) { |
|
19449
|
0
|
|
|
|
|
|
auto& children = words[words[id].head].children; |
|
19450
|
0
|
0
|
|
|
|
|
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19451
|
0
|
0
|
|
|
|
|
if (children[i-1] == id) { |
|
19452
|
|
|
|
|
|
|
children.erase(children.begin() + i - 1); |
|
19453
|
0
|
|
|
|
|
|
break; |
|
19454
|
|
|
|
|
|
|
} |
|
19455
|
|
|
|
|
|
|
} |
|
19456
|
|
|
|
|
|
|
|
|
19457
|
|
|
|
|
|
|
// Set new head |
|
19458
|
14
|
|
|
|
|
|
words[id].head = head; |
|
19459
|
7
|
|
|
|
|
|
words[id].deprel = deprel; |
|
19460
|
7
|
50
|
|
|
|
|
if (head >= 0) { |
|
19461
|
14
|
|
|
|
|
|
auto& children = words[head].children; |
|
19462
|
|
|
|
|
|
|
size_t i = children.size(); |
|
19463
|
7
|
100
|
|
|
|
|
while (i && children[i-1] > id) i--; |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
19464
|
7
|
100
|
|
|
|
|
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
19465
|
|
|
|
|
|
|
} |
|
19466
|
7
|
|
|
|
|
|
} |
|
19467
|
|
|
|
|
|
|
|
|
19468
|
0
|
|
|
|
|
|
void sentence::unlink_all_words() { |
|
19469
|
0
|
0
|
|
|
|
|
for (auto&& word : words) { |
|
19470
|
0
|
|
|
|
|
|
word.head = -1; |
|
19471
|
|
|
|
|
|
|
word.deprel.clear(); |
|
19472
|
|
|
|
|
|
|
word.children.clear(); |
|
19473
|
|
|
|
|
|
|
} |
|
19474
|
0
|
|
|
|
|
|
} |
|
19475
|
|
|
|
|
|
|
|
|
19476
|
0
|
|
|
|
|
|
// Returns true when the sentence carries a "newdoc id" or "newdoc" comment.
// The "newdoc id" variant is tried first; `id` is forwarded to get_comment,
// which fills it (when non-null) with the value of the matching comment.
bool sentence::get_new_doc(string* id) const {
  return get_comment("newdoc id", id) || get_comment("newdoc", id);
}
|
19481
|
|
|
|
|
|
|
|
|
19482
|
1
|
|
|
|
|
|
void sentence::set_new_doc(bool new_doc, string_piece id) { |
|
19483
|
1
|
|
|
|
|
|
remove_comment("newdoc"); |
|
19484
|
1
|
|
|
|
|
|
remove_comment("newdoc id"); |
|
19485
|
|
|
|
|
|
|
|
|
19486
|
1
|
50
|
|
|
|
|
if (new_doc && id.len) |
|
|
|
50
|
|
|
|
|
|
|
19487
|
0
|
|
|
|
|
|
set_comment("newdoc id", id); |
|
19488
|
1
|
50
|
|
|
|
|
else if (new_doc) |
|
19489
|
1
|
|
|
|
|
|
set_comment("newdoc"); |
|
19490
|
1
|
|
|
|
|
|
} |
|
19491
|
|
|
|
|
|
|
|
|
19492
|
0
|
|
|
|
|
|
// Returns true when the sentence carries a "newpar id" or "newpar" comment.
// The "newpar id" variant is tried first; `id` is forwarded to get_comment,
// which fills it (when non-null) with the value of the matching comment.
bool sentence::get_new_par(string* id) const {
  return get_comment("newpar id", id) || get_comment("newpar", id);
}
|
19497
|
|
|
|
|
|
|
|
|
19498
|
1
|
|
|
|
|
|
void sentence::set_new_par(bool new_par, string_piece id) { |
|
19499
|
1
|
|
|
|
|
|
remove_comment("newpar"); |
|
19500
|
1
|
|
|
|
|
|
remove_comment("newpar id"); |
|
19501
|
|
|
|
|
|
|
|
|
19502
|
1
|
50
|
|
|
|
|
if (new_par && id.len) |
|
|
|
50
|
|
|
|
|
|
|
19503
|
0
|
|
|
|
|
|
set_comment("newpar id", id); |
|
19504
|
1
|
50
|
|
|
|
|
else if (new_par) |
|
19505
|
1
|
|
|
|
|
|
set_comment("newpar"); |
|
19506
|
1
|
|
|
|
|
|
} |
|
19507
|
|
|
|
|
|
|
|
|
19508
|
0
|
|
|
|
|
|
// Looks up the "sent_id" comment. `id` is emptied first, so it stays empty
// when no such comment exists; the return value tells whether one was found.
bool sentence::get_sent_id(string& id) const {
  id.clear();

  const bool found = get_comment("sent_id", &id);
  return found;
}
|
19513
|
|
|
|
|
|
|
|
|
19514
|
1
|
|
|
|
|
|
// Replaces the "sent_id" comment; an empty `id` merely removes it.
void sentence::set_sent_id(string_piece id) {
  remove_comment("sent_id");

  if (id.len == 0) return;
  set_comment("sent_id", id);
}
|
19520
|
|
|
|
|
|
|
|
|
19521
|
0
|
|
|
|
|
|
// Looks up the "text" comment. `text` is emptied first, so it stays empty
// when no such comment exists; the return value tells whether one was found.
bool sentence::get_text(string& text) const {
  text.clear();

  const bool found = get_comment("text", &text);
  return found;
}
|
19526
|
|
|
|
|
|
|
|
|
19527
|
0
|
|
|
|
|
|
// Replaces the "text" comment; an empty `text` merely removes it.
void sentence::set_text(string_piece text) {
  remove_comment("text");

  if (text.len == 0) return;
  set_comment("text", text);
}
|
19533
|
|
|
|
|
|
|
|
|
19534
|
0
|
|
|
|
|
|
// Searches the comments for the first one of the shape
//   '#' [blanks] name [blanks] [ '=' [blanks] value ]
// where blanks are spaces/tabs and `name` is matched as a prefix of the text
// following the leading blanks. On a match true is returned and, when
// `value` is non-null, it receives the text after '=' (or is emptied if the
// comment has no '='). Returns false when nothing matches.
bool sentence::get_comment(string_piece name, string* value) const {
  // Returns the first position >= pos that does not hold a space or a tab.
  auto skip_blanks = [](const string& text, unsigned pos) -> unsigned {
    while (pos < text.size() && (text[pos] == ' ' || text[pos] == '\t')) pos++;
    return pos;
  };

  for (const string& line : comments) {
    if (line[0] != '#') continue;

    unsigned cursor = skip_blanks(line, 1);

    // The name has to fit into the rest of the line and match exactly there.
    if (cursor + name.len > line.size()) continue;
    if (line.compare(cursor, name.len, name.str, name.len) != 0) continue;

    cursor = skip_blanks(line, cursor + name.len);

    if (value) {
      if (cursor < line.size() && line[cursor] == '=') {
        // A value is present: it spans from after '=' and blanks to the line end.
        cursor = skip_blanks(line, cursor + 1);
        value->assign(line, cursor, line.size() - cursor);
      } else {
        value->clear();
      }
    }
    return true;
  }

  return false;
}
|
19561
|
|
|
|
|
|
|
|
|
19562
|
8
|
|
|
|
|
|
// Erases every comment of the shape '#' [blanks] name..., i.e. whose text
// after the leading spaces/tabs starts with `name`. The comments are walked
// from the last to the first so erasing does not disturb pending indices.
void sentence::remove_comment(string_piece name) {
  unsigned remaining = comments.size();
  while (remaining--) {
    const string& line = comments[remaining];
    if (line[0] != '#') continue;

    // Skip the blanks between '#' and the comment name.
    unsigned start = 1;
    while (start < line.size() && (line[start] == ' ' || line[start] == '\t')) start++;

    const bool fits = start + name.len <= line.size();
    if (fits && line.compare(start, name.len, name.str, name.len) == 0)
      comments.erase(comments.begin() + remaining);
  }
}
|
19574
|
|
|
|
|
|
|
|
|
19575
|
3
|
|
|
|
|
|
// Replaces the comment `name`: existing matching comments are removed and
// "# name" (empty value) or "# name = value" is appended. Carriage returns
// and line feeds inside the value are written as spaces.
void sentence::set_comment(string_piece name, string_piece value) {
  remove_comment(name);

  string line("# ");
  line.append(name.str, name.len);

  if (value.len) {
    line += " = ";
    const char* const value_end = value.str + value.len;
    for (const char* chr = value.str; chr != value_end; ++chr) {
      const bool line_break = *chr == '\r' || *chr == '\n';
      line.push_back(line_break ? ' ' : *chr);
    }
  }

  comments.push_back(move(line));
}
|
19587
|
|
|
|
|
|
|
|
|
19588
|
|
|
|
|
|
|
///////// |
|
19589
|
|
|
|
|
|
|
// File: sentence/token.cpp |
|
19590
|
|
|
|
|
|
|
///////// |
|
19591
|
|
|
|
|
|
|
|
|
19592
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
19593
|
|
|
|
|
|
|
// |
|
19594
|
|
|
|
|
|
|
// Copyright 2017 Institute of Formal and Applied Linguistics, Faculty of |
|
19595
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
19596
|
|
|
|
|
|
|
// |
|
19597
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
19598
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
19599
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
19600
|
|
|
|
|
|
|
|
|
19601
|
11
|
|
|
|
|
|
// Builds a token from the given form and MISC field; an argument of zero
// length leaves the corresponding member default-constructed.
token::token(string_piece form, string_piece misc) {
  if (form.len != 0) {
    this->form.assign(form.str, form.len);
  }
  if (misc.len != 0) {
    this->misc.assign(misc.str, misc.len);
  }
}
|
19605
|
|
|
|
|
|
|
|
|
19606
|
|
|
|
|
|
|
// CoNLL-U defined SpaceAfter=No feature |
|
19607
|
6
|
|
|
|
|
|
bool token::get_space_after() const { |
|
19608
|
|
|
|
|
|
|
string_piece value; |
|
19609
|
|
|
|
|
|
|
|
|
19610
|
6
|
100
|
|
|
|
|
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
19611
|
|
|
|
|
|
|
} |
|
19612
|
|
|
|
|
|
|
|
|
19613
|
7
|
|
|
|
|
|
void token::set_space_after(bool space_after) { |
|
19614
|
7
|
100
|
|
|
|
|
if (space_after) |
|
19615
|
5
|
|
|
|
|
|
remove_misc_field("SpaceAfter"); |
|
19616
|
|
|
|
|
|
|
else |
|
19617
|
2
|
|
|
|
|
|
start_misc_field("SpaceAfter").append("No"); |
|
19618
|
7
|
|
|
|
|
|
} |
|
19619
|
|
|
|
|
|
|
|
|
19620
|
|
|
|
|
|
|
// UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features |
|
19621
|
0
|
|
|
|
|
|
// Fills `spaces_before` with the unescaped value of the SpacesBefore MISC
// field, or empties it when the field is absent.
void token::get_spaces_before(string& spaces_before) const {
  string_piece value;
  const bool present = get_misc_field("SpacesBefore", value);

  if (!present) {
    spaces_before.clear();
    return;
  }
  unescape_spaces(value, spaces_before);
}
|
19629
|
|
|
|
|
|
|
|
|
19630
|
7
|
|
|
|
|
|
// Stores `spaces_before` escaped in the SpacesBefore MISC field; an empty
// argument removes the field instead.
void token::set_spaces_before(string_piece spaces_before) {
  if (spaces_before.len == 0) {
    remove_misc_field("SpacesBefore");
    return;
  }

  string& field = start_misc_field("SpacesBefore");
  append_escaped_spaces(spaces_before, field);
}
|
19636
|
|
|
|
|
|
|
|
|
19637
|
0
|
|
|
|
|
|
// Fills `spaces_after` with the unescaped value of the SpacesAfter MISC
// field. Without that field the result falls back to get_space_after():
// a single space when it is true, an empty string otherwise.
void token::get_spaces_after(string& spaces_after) const {
  string_piece value;
  if (get_misc_field("SpacesAfter", value)) {
    unescape_spaces(value, spaces_after);
    return;
  }

  if (get_space_after())
    spaces_after.assign(" ");
  else
    spaces_after.clear();
}
|
19645
|
|
|
|
|
|
|
|
|
19646
|
7
|
|
|
|
|
|
// Records the whitespace following the token:
//  - empty string        -> SpaceAfter=No, SpacesAfter removed
//  - exactly one ' '     -> SpaceAfter removed, SpacesAfter removed
//  - anything else       -> SpaceAfter removed, SpacesAfter=<escaped spaces>
void token::set_spaces_after(string_piece spaces_after) {
  set_space_after(spaces_after.len != 0);

  const bool single_space = spaces_after.len == 1 && spaces_after.str[0] == ' ';
  if (spaces_after.len == 0 || single_space) {
    // Fully described by the SpaceAfter flag; no explicit field is needed.
    remove_misc_field("SpacesAfter");
    return;
  }

  string& field = start_misc_field("SpacesAfter");
  append_escaped_spaces(spaces_after, field);
}
|
19658
|
|
|
|
|
|
|
|
|
19659
|
0
|
|
|
|
|
|
// Fills `spaces_in_token` with the unescaped value of the SpacesInToken MISC
// field, or empties it when the field is absent.
void token::get_spaces_in_token(string& spaces_in_token) const {
  string_piece value;
  const bool present = get_misc_field("SpacesInToken", value);

  if (!present) {
    spaces_in_token.clear();
    return;
  }
  unescape_spaces(value, spaces_in_token);
}
|
19667
|
|
|
|
|
|
|
|
|
19668
|
7
|
|
|
|
|
|
// Stores `spaces_in_token` escaped in the SpacesInToken MISC field; an empty
// argument removes the field instead.
void token::set_spaces_in_token(string_piece spaces_in_token) {
  if (spaces_in_token.len == 0) {
    remove_misc_field("SpacesInToken");
    return;
  }

  string& field = start_misc_field("SpacesInToken");
  append_escaped_spaces(spaces_in_token, field);
}
|
19674
|
|
|
|
|
|
|
|
|
19675
|
|
|
|
|
|
|
// UDPipe-specific TokenRange feature |
|
19676
|
0
|
|
|
|
|
|
bool token::get_token_range(size_t& start, size_t& end) const { |
|
19677
|
|
|
|
|
|
|
string_piece value; |
|
19678
|
|
|
|
|
|
|
|
|
19679
|
0
|
0
|
|
|
|
|
if (!get_misc_field("TokenRange", value)) return false; |
|
19680
|
|
|
|
|
|
|
|
|
19681
|
0
|
|
|
|
|
|
start = 0; |
|
19682
|
0
|
0
|
|
|
|
|
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19683
|
0
|
0
|
|
|
|
|
if (start > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
|
19684
|
|
|
|
|
|
|
return false; |
|
19685
|
0
|
|
|
|
|
|
start = 10 * start + (value.str[0] - '0'); |
|
19686
|
0
|
|
|
|
|
|
value.str++, value.len--; |
|
19687
|
|
|
|
|
|
|
} |
|
19688
|
|
|
|
|
|
|
|
|
19689
|
0
|
0
|
|
|
|
|
if (value.len == 0 || value.str[0] != ':') return false; |
|
|
|
0
|
|
|
|
|
|
|
19690
|
0
|
|
|
|
|
|
value.str++, value.len--; |
|
19691
|
|
|
|
|
|
|
|
|
19692
|
0
|
|
|
|
|
|
end = 0; |
|
19693
|
0
|
0
|
|
|
|
|
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19694
|
0
|
0
|
|
|
|
|
if (end > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
|
19695
|
|
|
|
|
|
|
return false; |
|
19696
|
0
|
|
|
|
|
|
end = 10 * end + (value.str[0] - '0'); |
|
19697
|
0
|
|
|
|
|
|
value.str++, value.len--; |
|
19698
|
|
|
|
|
|
|
} |
|
19699
|
|
|
|
|
|
|
|
|
19700
|
|
|
|
|
|
|
return true; |
|
19701
|
|
|
|
|
|
|
} |
|
19702
|
|
|
|
|
|
|
|
|
19703
|
0
|
|
|
|
|
|
void token::set_token_range(size_t start, size_t end) { |
|
19704
|
0
|
0
|
|
|
|
|
if (start == size_t(string::npos)) |
|
19705
|
0
|
|
|
|
|
|
remove_misc_field("TokenRange"); |
|
19706
|
|
|
|
|
|
|
else |
|
19707
|
0
|
0
|
|
|
|
|
start_misc_field("TokenRange").append(to_string(start)).append(1, ':').append(to_string(end)); |
|
19708
|
0
|
|
|
|
|
|
} |
|
19709
|
|
|
|
|
|
|
|
|
19710
|
|
|
|
|
|
|
// Private MISC field helpers |
|
19711
|
12
|
|
|
|
|
|
// Finds the first '|'-separated MISC field of the form name=value.
// On success `value` is pointed at the value text inside `misc` (no copy is
// made) and true is returned; otherwise false is returned.
bool token::get_misc_field(string_piece name, string_piece& value) const {
  size_t field_start = 0;

  while (field_start < misc.size()) {
    const bool name_matches = misc.compare(field_start, name.len, name.str, name.len) == 0;

    if (name_matches && misc[field_start + name.len] == '=') {
      // The value runs from just after '=' to the next '|' or the end of MISC.
      const size_t value_start = field_start + name.len + 1;
      const size_t bar = misc.find('|', value_start);
      const size_t value_end = bar == size_t(string::npos) ? misc.size() : bar;

      value.str = misc.c_str() + value_start;
      value.len = value_end - value_start;
      return true;
    }

    // Move on to the field after the next '|', if there is one.
    const size_t bar = misc.find('|', field_start);
    if (bar == size_t(string::npos)) break;
    field_start = bar + 1;
  }

  return false;
}
|
19725
|
|
|
|
|
|
|
|
|
19726
|
64
|
|
|
|
|
|
// Deletes every '|'-separated MISC field of the form name=value, together
// with exactly one adjacent '|' so the remaining fields stay well separated.
void token::remove_misc_field(string_piece name) {
  size_t pos = 0;

  while (pos < misc.size()) {
    const bool name_matches = misc.compare(pos, name.len, name.str, name.len) == 0;

    if (!(name_matches && misc[pos + name.len] == '=')) {
      // Not the requested field: jump behind the next separator.
      pos = misc.find('|', pos);
      if (pos != size_t(string::npos)) ++pos;
      continue;
    }

    size_t field_end = misc.find('|', pos + name.len + 1);
    if (field_end == size_t(string::npos)) field_end = misc.size();

    if (pos > 0) {
      // Remove the separator in front of the field along with the field.
      misc.erase(pos - 1, field_end - pos + 1);
    } else {
      // First field: remove the separator behind it, if any.
      const size_t trailing_bar = field_end < misc.size() ? 1 : 0;
      misc.erase(0, field_end + trailing_bar);
    }
    // `pos` is kept: it now addresses whatever followed the removed field.
  }
}
|
19742
|
|
|
|
|
|
|
|
|
19743
|
2
|
|
|
|
|
|
// Removes any existing `name` field and appends "name=" to MISC (preceded by
// '|' when MISC is not empty). Returns MISC itself so the caller can append
// the value.
string& token::start_misc_field(string_piece name) {
  remove_misc_field(name);

  if (!misc.empty()) misc += '|';
  misc.append(name.str, name.len);
  misc += '=';

  return misc;
}
|
19749
|
|
|
|
|
|
|
|
|
19750
|
0
|
|
|
|
|
|
// Appends `spaces` to `escaped_spaces`, writing ' ', '|', tab, CR, LF and
// '\' as the two-character sequences \s, \p, \t, \r, \n and \\; every other
// character is copied unchanged.
void token::append_escaped_spaces(string_piece spaces, string& escaped_spaces) const {
  for (unsigned i = 0; i < spaces.len; i++) {
    const char chr = spaces.str[i];

    // Letter following the backslash, or 0 when chr is copied verbatim.
    char code = 0;
    if (chr == ' ') code = 's';
    else if (chr == '|') code = 'p';
    else if (chr == '\t') code = 't';
    else if (chr == '\r') code = 'r';
    else if (chr == '\n') code = 'n';
    else if (chr == '\\') code = '\\';

    if (code) {
      escaped_spaces.push_back('\\');
      escaped_spaces.push_back(code);
    } else {
      escaped_spaces.push_back(chr);
    }
  }
}
|
19769
|
|
|
|
|
|
|
|
|
19770
|
0
|
|
|
|
|
|
// Replaces `spaces` with the decoded form of `escaped_spaces`: the sequences
// \s, \p, \t, \r, \n and \\ become ' ', '|', tab, CR, LF and '\'. A
// backslash followed by any other character, or a backslash at the very end,
// is copied as it is.
void token::unescape_spaces(string_piece escaped_spaces, string& spaces) const {
  spaces.clear();

  unsigned i = 0;
  while (i < escaped_spaces.len) {
    const char chr = escaped_spaces.str[i++];

    // Ordinary character, or a backslash with nothing after it.
    if (chr != '\\' || i >= escaped_spaces.len) {
      spaces.push_back(chr);
      continue;
    }

    const char code = escaped_spaces.str[i++];
    if (code == 's') spaces.push_back(' ');
    else if (code == 'p') spaces.push_back('|');
    else if (code == 't') spaces.push_back('\t');
    else if (code == 'r') spaces.push_back('\r');
    else if (code == 'n') spaces.push_back('\n');
    else if (code == '\\') spaces.push_back('\\');
    else {
      // Unknown escape: keep both characters.
      spaces.push_back(chr);
      spaces.push_back(code);
    }
  }
}
|
19794
|
|
|
|
|
|
|
|
|
19795
|
|
|
|
|
|
|
///////// |
|
19796
|
|
|
|
|
|
|
// File: tokenizer/detokenizer.h |
|
19797
|
|
|
|
|
|
|
///////// |
|
19798
|
|
|
|
|
|
|
|
|
19799
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
19800
|
|
|
|
|
|
|
// |
|
19801
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
19802
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
19803
|
|
|
|
|
|
|
// |
|
19804
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
19805
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
19806
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
19807
|
|
|
|
|
|
|
|
|
19808
|
0
|
|
|
|
|
|
class detokenizer { |
|
19809
|
|
|
|
|
|
|
public: |
|
19810
|
|
|
|
|
|
|
detokenizer(const string& plain_text); |
|
19811
|
|
|
|
|
|
|
|
|
19812
|
|
|
|
|
|
|
void detokenize(sentence& s) const; |
|
19813
|
|
|
|
|
|
|
private: |
|
19814
|
|
|
|
|
|
|
enum { LOWERCASE, CATEGORIZE, TOTAL }; |
|
19815
|
|
|
|
|
|
|
|
|
19816
|
|
|
|
|
|
|
int difference(const string& left, const string& right, bool separate, int mode) const; |
|
19817
|
|
|
|
|
|
|
|
|
19818
|
|
|
|
|
|
|
static string perform_lowercase(const string& input); |
|
19819
|
|
|
|
|
|
|
static string perform_categorize(const string& input); |
|
19820
|
|
|
|
|
|
|
bool has_letters(const string& word) const; |
|
19821
|
|
|
|
|
|
|
bool only_digits(const string& word) const; |
|
19822
|
|
|
|
|
|
|
|
|
19823
|
0
|
|
|
|
|
|
class suffix_array { |
|
19824
|
|
|
|
|
|
|
public: |
|
19825
|
|
|
|
|
|
|
suffix_array(const string& str); |
|
19826
|
|
|
|
|
|
|
suffix_array(suffix_array&& other) = default; |
|
19827
|
|
|
|
|
|
|
|
|
19828
|
|
|
|
|
|
|
unsigned count(const string& data) const; |
|
19829
|
|
|
|
|
|
|
|
|
19830
|
|
|
|
|
|
|
private: |
|
19831
|
|
|
|
|
|
|
vector sa; |
|
19832
|
|
|
|
|
|
|
|
|
19833
|
|
|
|
|
|
|
struct suffix_compare { |
|
19834
|
0
|
|
|
|
|
|
suffix_compare(const string& str) : str(str) {} |
|
19835
|
0
|
|
|
|
|
|
bool operator()(unsigned a, unsigned b) const { return str.compare(a, string::npos, str, b, string::npos) < 0; } |
|
19836
|
|
|
|
|
|
|
private: |
|
19837
|
|
|
|
|
|
|
const string& str; |
|
19838
|
|
|
|
|
|
|
} suffix_comparator; |
|
19839
|
|
|
|
|
|
|
|
|
19840
|
|
|
|
|
|
|
struct suffix_lower_find { |
|
19841
|
0
|
|
|
|
|
|
suffix_lower_find(const string& str) : str(str) {} |
|
19842
|
0
|
|
|
|
|
|
bool operator()(unsigned a, const string& data) const { return str.compare(a, data.size(), data) < 0; } |
|
19843
|
|
|
|
|
|
|
|
|
19844
|
|
|
|
|
|
|
private: |
|
19845
|
|
|
|
|
|
|
const string& str; |
|
19846
|
|
|
|
|
|
|
} suffix_lower_finder; |
|
19847
|
|
|
|
|
|
|
|
|
19848
|
|
|
|
|
|
|
struct suffix_upper_find { |
|
19849
|
0
|
|
|
|
|
|
suffix_upper_find(const string& str) : str(str) {} |
|
19850
|
0
|
|
|
|
|
|
bool operator()(const string& data, unsigned a) const { return str.compare(a, data.size(), data) > 0; } |
|
19851
|
|
|
|
|
|
|
|
|
19852
|
|
|
|
|
|
|
private: |
|
19853
|
|
|
|
|
|
|
const string& str; |
|
19854
|
|
|
|
|
|
|
} suffix_upper_finder; |
|
19855
|
|
|
|
|
|
|
}; |
|
19856
|
|
|
|
|
|
|
|
|
19857
|
|
|
|
|
|
|
string data_lowercased, data_categorized; |
|
19858
|
|
|
|
|
|
|
suffix_array sa_lowercased, sa_categorized; |
|
19859
|
|
|
|
|
|
|
}; |
|
19860
|
|
|
|
|
|
|
|
|
19861
|
|
|
|
|
|
|
///////// |
|
19862
|
|
|
|
|
|
|
// File: tokenizer/detokenizer.cpp |
|
19863
|
|
|
|
|
|
|
///////// |
|
19864
|
|
|
|
|
|
|
|
|
19865
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
19866
|
|
|
|
|
|
|
// |
|
19867
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
19868
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
19869
|
|
|
|
|
|
|
// |
|
19870
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
19871
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
19872
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
19873
|
|
|
|
|
|
|
|
|
19874
|
0
|
|
|
|
|
|
// Stores the lowercased and the categorized copy of `plain_text` and builds
// one suffix array over each of them.
detokenizer::detokenizer(const string& plain_text)
  : data_lowercased(perform_lowercase(plain_text)),
    data_categorized(perform_categorize(plain_text)),
    sa_lowercased(data_lowercased),
    sa_categorized(data_categorized) {
}
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19877
|
|
|
|
|
|
|
|
|
19878
|
0
|
|
|
|
|
|
// Walks over the surface tokens of `s` (a multiword token stands in for the
// words it covers) and sets SpaceAfter=No on a token whenever the sample
// text suggests it is written together with the following one. Every token
// first has its SpaceAfter attribute removed; the last token keeps it removed.
void detokenizer::detokenize(sentence& s) const {
  token* previous_tok = nullptr;

  for (size_t i = 1, j = 0; i < s.words.size(); i++) {
    // Does the next unprocessed multiword token start at this word?
    const bool at_multiword = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i);

    token* tok;
    if (at_multiword)
      tok = (token*)&s.multiword_tokens[j];
    else
      tok = (token*)&s.words[i];

    if (previous_tok) {
      const string& prev_form = previous_tok->form;
      const string& cur_form = tok->form;

      // Successive heuristics; the first one giving a non-zero score decides.
      int score = difference(prev_form, cur_form, true, LOWERCASE);
      if (score == 0 && has_letters(prev_form) && has_letters(cur_form)) score = -1;
      if (score == 0 && only_digits(prev_form) && only_digits(cur_form)) score = -1;
      if (score == 0) score = difference(prev_form, cur_form, false, LOWERCASE);
      if (score == 0) score = difference(prev_form, cur_form, false, CATEGORIZE);
      if (score == 0) score = difference(prev_form, cur_form, true, CATEGORIZE);

      // Positive score: the joined form is the more frequent one.
      if (score > 0)
        previous_tok->set_space_after(false);
    }

    // Remove the SpaceAfter attribute on the current token.
    tok->set_space_after(true);
    previous_tok = tok;

    // Skip the remaining words covered by the multiword token.
    if (at_multiword)
      i = s.multiword_tokens[j++].id_last;
  }
}
|
19904
|
|
|
|
|
|
|
|
|
19905
|
0
|
|
|
|
|
|
int detokenizer::difference(const string& left, const string& right, bool separate, int mode) const { |
|
19906
|
0
|
0
|
|
|
|
|
auto& func = mode == LOWERCASE ? perform_lowercase : perform_categorize; |
|
19907
|
0
|
0
|
|
|
|
|
auto& sa = mode == LOWERCASE ? sa_lowercased : sa_categorized; |
|
19908
|
|
|
|
|
|
|
|
|
19909
|
0
|
|
|
|
|
|
string left_mapped = func(left); |
|
19910
|
0
|
0
|
|
|
|
|
string right_mapped = func(right); |
|
19911
|
|
|
|
|
|
|
string pattern; |
|
19912
|
|
|
|
|
|
|
|
|
19913
|
0
|
0
|
|
|
|
|
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19914
|
0
|
0
|
|
|
|
|
int together = sa.count(pattern); |
|
19915
|
|
|
|
|
|
|
|
|
19916
|
0
|
0
|
|
|
|
|
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
19917
|
0
|
0
|
|
|
|
|
int apart = sa.count(pattern); |
|
19918
|
|
|
|
|
|
|
|
|
19919
|
0
|
|
|
|
|
|
return together - apart; |
|
19920
|
|
|
|
|
|
|
} |
|
19921
|
|
|
|
|
|
|
|
|
19922
|
0
|
|
|
|
|
|
string detokenizer::perform_lowercase(const string& input) { |
|
19923
|
|
|
|
|
|
|
using namespace unilib; |
|
19924
|
|
|
|
|
|
|
|
|
19925
|
|
|
|
|
|
|
string output; |
|
19926
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(input)) |
|
19927
|
0
|
0
|
|
|
|
|
utf8::append(output, unicode::lowercase(chr)); |
|
19928
|
0
|
|
|
|
|
|
return output; |
|
19929
|
|
|
|
|
|
|
} |
|
19930
|
|
|
|
|
|
|
|
|
19931
|
0
|
|
|
|
|
|
string detokenizer::perform_categorize(const string& input) { |
|
19932
|
|
|
|
|
|
|
using namespace unilib; |
|
19933
|
|
|
|
|
|
|
|
|
19934
|
|
|
|
|
|
|
string output; |
|
19935
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(input)) { |
|
19936
|
0
|
|
|
|
|
|
auto category = unicode::category(chr); |
|
19937
|
0
|
0
|
|
|
|
|
if (category & unicode::C) output.push_back('C'); |
|
|
|
0
|
|
|
|
|
|
|
19938
|
0
|
0
|
|
|
|
|
if (category & unicode::L) output.push_back('L'); |
|
|
|
0
|
|
|
|
|
|
|
19939
|
0
|
0
|
|
|
|
|
if (category & unicode::M) output.push_back('M'); |
|
|
|
0
|
|
|
|
|
|
|
19940
|
0
|
0
|
|
|
|
|
if (category & unicode::N) output.push_back('N'); |
|
|
|
0
|
|
|
|
|
|
|
19941
|
0
|
0
|
|
|
|
|
if (category & unicode::Pc) output.push_back('c'); |
|
|
|
0
|
|
|
|
|
|
|
19942
|
0
|
0
|
|
|
|
|
if (category & unicode::Pd) output.push_back('d'); |
|
|
|
0
|
|
|
|
|
|
|
19943
|
0
|
0
|
|
|
|
|
if (category & unicode::Pe) output.push_back('e'); |
|
|
|
0
|
|
|
|
|
|
|
19944
|
0
|
0
|
|
|
|
|
if (category & unicode::Pf) output.push_back('f'); |
|
|
|
0
|
|
|
|
|
|
|
19945
|
0
|
0
|
|
|
|
|
if (category & unicode::Pi) output.push_back('i'); |
|
|
|
0
|
|
|
|
|
|
|
19946
|
0
|
0
|
|
|
|
|
if (category & unicode::Po) output.push_back('o'); |
|
|
|
0
|
|
|
|
|
|
|
19947
|
0
|
0
|
|
|
|
|
if (category & unicode::Ps) output.push_back('s'); |
|
|
|
0
|
|
|
|
|
|
|
19948
|
0
|
0
|
|
|
|
|
if (category & unicode::S) output.push_back('S'); |
|
|
|
0
|
|
|
|
|
|
|
19949
|
0
|
0
|
|
|
|
|
if (category & unicode::Zl) output.push_back('Z'); |
|
|
|
0
|
|
|
|
|
|
|
19950
|
0
|
0
|
|
|
|
|
if (category & unicode::Zp) output.push_back('z'); |
|
|
|
0
|
|
|
|
|
|
|
19951
|
0
|
0
|
|
|
|
|
if (category & unicode::Zs) output.push_back(' '); |
|
|
|
0
|
|
|
|
|
|
|
19952
|
|
|
|
|
|
|
} |
|
19953
|
0
|
|
|
|
|
|
return output; |
|
19954
|
|
|
|
|
|
|
} |
|
19955
|
|
|
|
|
|
|
|
|
19956
|
0
|
|
|
|
|
|
bool detokenizer::has_letters(const string& word) const { |
|
19957
|
|
|
|
|
|
|
using namespace unilib; |
|
19958
|
|
|
|
|
|
|
|
|
19959
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) |
|
19960
|
0
|
0
|
|
|
|
|
if (unicode::category(chr) & unicode::L) |
|
19961
|
0
|
|
|
|
|
|
return true; |
|
19962
|
0
|
|
|
|
|
|
return false; |
|
19963
|
|
|
|
|
|
|
} |
|
19964
|
|
|
|
|
|
|
|
|
19965
|
0
|
|
|
|
|
|
bool detokenizer::only_digits(const string& word) const { |
|
19966
|
|
|
|
|
|
|
using namespace unilib; |
|
19967
|
|
|
|
|
|
|
|
|
19968
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(word)) |
|
19969
|
0
|
0
|
|
|
|
|
if (unicode::category(chr) & ~unicode::N) |
|
19970
|
0
|
|
|
|
|
|
return false; |
|
19971
|
0
|
|
|
|
|
|
return true; |
|
19972
|
|
|
|
|
|
|
} |
|
19973
|
|
|
|
|
|
|
|
|
19974
|
0
|
|
|
|
|
|
detokenizer::suffix_array::suffix_array(const string& str) : suffix_comparator(str), suffix_lower_finder(str), suffix_upper_finder(str) { |
|
19975
|
0
|
0
|
|
|
|
|
sa.reserve(str.size()); |
|
19976
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < str.size(); i++) |
|
19977
|
0
|
0
|
|
|
|
|
sa.push_back(i); |
|
19978
|
|
|
|
|
|
|
|
|
19979
|
|
|
|
|
|
|
sort(sa.begin(), sa.end(), suffix_comparator); |
|
19980
|
0
|
|
|
|
|
|
} |
|
19981
|
|
|
|
|
|
|
|
|
19982
|
0
|
|
|
|
|
|
unsigned detokenizer::suffix_array::count(const string& data) const { |
|
19983
|
|
|
|
|
|
|
auto lower_it = lower_bound(sa.begin(), sa.end(), data, suffix_lower_finder); |
|
19984
|
|
|
|
|
|
|
auto upper_it = upper_bound(sa.begin(), sa.end(), data, suffix_upper_finder); |
|
19985
|
0
|
|
|
|
|
|
return upper_it - lower_it; |
|
19986
|
|
|
|
|
|
|
} |
|
19987
|
|
|
|
|
|
|
|
|
19988
|
|
|
|
|
|
|
///////// |
|
19989
|
|
|
|
|
|
|
// File: tokenizer/morphodita_tokenizer_wrapper.cpp |
|
19990
|
|
|
|
|
|
|
///////// |
|
19991
|
|
|
|
|
|
|
|
|
19992
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
19993
|
|
|
|
|
|
|
// |
|
19994
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
19995
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
19996
|
|
|
|
|
|
|
// |
|
19997
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
19998
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
19999
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20000
|
|
|
|
|
|
|
|
|
20001
|
1
|
|
|
|
|
|
morphodita_tokenizer_wrapper::morphodita_tokenizer_wrapper(morphodita::tokenizer* tokenizer, const multiword_splitter* splitter, |
|
20002
|
|
|
|
|
|
|
bool normalized_spaces, bool token_ranges) |
|
20003
|
1
|
50
|
|
|
|
|
: tokenizer(tokenizer), splitter(splitter), normalized_spaces(normalized_spaces), token_ranges(token_ranges) {} |
|
20004
|
|
|
|
|
|
|
|
|
20005
|
0
|
|
|
|
|
|
bool morphodita_tokenizer_wrapper::read_block(istream& is, string& block) const { |
|
20006
|
0
|
|
|
|
|
|
return bool(getpara(is, block)); |
|
20007
|
|
|
|
|
|
|
} |
|
20008
|
|
|
|
|
|
|
|
|
20009
|
0
|
|
|
|
|
|
void morphodita_tokenizer_wrapper::reset_document(string_piece id) { |
|
20010
|
0
|
|
|
|
|
|
new_document = true; |
|
20011
|
0
|
|
|
|
|
|
document_id.assign(id.str, id.len); |
|
20012
|
0
|
|
|
|
|
|
preceeding_newlines = 2; |
|
20013
|
0
|
|
|
|
|
|
sentence_id = 1; |
|
20014
|
0
|
|
|
|
|
|
set_text(""); |
|
20015
|
0
|
|
|
|
|
|
unicode_offset = 0; |
|
20016
|
0
|
|
|
|
|
|
text_unicode_length = 0; |
|
20017
|
|
|
|
|
|
|
saved_spaces.clear(); |
|
20018
|
0
|
|
|
|
|
|
} |
|
20019
|
|
|
|
|
|
|
|
|
20020
|
1
|
|
|
|
|
|
void morphodita_tokenizer_wrapper::set_text(string_piece text, bool make_copy) { |
|
20021
|
|
|
|
|
|
|
// Start by skipping spaces and copying them to saved_spaces |
|
20022
|
|
|
|
|
|
|
string_piece following; |
|
20023
|
1
|
50
|
|
|
|
|
for (char32_t chr; |
|
20024
|
2
|
50
|
|
|
|
|
text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
|
|
50
|
|
|
|
|
|
|
20025
|
1
|
50
|
|
|
|
|
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); |
|
|
|
50
|
|
|
|
|
|
|
20026
|
0
|
|
|
|
|
|
text = following, unicode_offset++) |
|
20027
|
0
|
|
|
|
|
|
saved_spaces.append(text.str, following.str - text.str); |
|
20028
|
|
|
|
|
|
|
|
|
20029
|
|
|
|
|
|
|
// Offset unicode_offset by length of previous text, update text_unicode_length for the new text |
|
20030
|
1
|
|
|
|
|
|
unicode_offset += text_unicode_length; |
|
20031
|
1
|
|
|
|
|
|
text_unicode_length = 0; |
|
20032
|
35
|
100
|
|
|
|
|
for (following = text; following.len; unilib::utf8::decode(following.str, following.len)) |
|
20033
|
34
|
|
|
|
|
|
text_unicode_length++; |
|
20034
|
|
|
|
|
|
|
|
|
20035
|
|
|
|
|
|
|
// Copy the text to local storage if needed |
|
20036
|
1
|
50
|
|
|
|
|
if (make_copy) { |
|
20037
|
1
|
|
|
|
|
|
text_copy.assign(text.str, text.len); |
|
20038
|
|
|
|
|
|
|
text = string_piece(text_copy.c_str(), text_copy.size()); |
|
20039
|
|
|
|
|
|
|
} |
|
20040
|
|
|
|
|
|
|
|
|
20041
|
|
|
|
|
|
|
// Store the text locally and in the morphodita::tokenizer |
|
20042
|
1
|
|
|
|
|
|
this->text = text; |
|
20043
|
1
|
|
|
|
|
|
tokenizer->set_text(this->text, false); |
|
20044
|
|
|
|
|
|
|
|
|
20045
|
1
|
|
|
|
|
|
} |
|
20046
|
|
|
|
|
|
|
|
|
20047
|
2
|
|
|
|
|
|
bool morphodita_tokenizer_wrapper::next_sentence(sentence& s, string& error) { |
|
20048
|
|
|
|
|
|
|
unsigned following_newlines = 0; |
|
20049
|
|
|
|
|
|
|
|
|
20050
|
2
|
|
|
|
|
|
s.clear(); |
|
20051
|
|
|
|
|
|
|
error.clear(); |
|
20052
|
|
|
|
|
|
|
|
|
20053
|
2
|
50
|
|
|
|
|
if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) { |
|
|
|
100
|
|
|
|
|
|
|
20054
|
|
|
|
|
|
|
// The forms returned by GRU tokenizer *should not* start/end with spaces, |
|
20055
|
|
|
|
|
|
|
// but we trim them anyway (including all "remove empty forms/sentences" machinery). |
|
20056
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < forms.size(); i++) { |
|
20057
|
14
|
50
|
|
|
|
|
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
20058
|
7
|
50
|
|
|
|
|
forms[i].str[0] == '\t' || forms[i].str[0] == ' ')) |
|
20059
|
0
|
|
|
|
|
|
forms[i].str++, forms[i].len--; |
|
20060
|
14
|
50
|
|
|
|
|
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
20061
|
7
|
50
|
|
|
|
|
forms[i].str[forms[i].len-1] == '\t' || forms[i].str[forms[i].len-1] == ' ')) |
|
20062
|
0
|
|
|
|
|
|
forms[i].len--; |
|
20063
|
7
|
50
|
|
|
|
|
if (!forms[i].len) |
|
20064
|
0
|
|
|
|
|
|
forms.erase(forms.begin() + i--); |
|
20065
|
|
|
|
|
|
|
} |
|
20066
|
8
|
50
|
|
|
|
|
if (!forms.size()) return next_sentence(s, error); |
|
20067
|
|
|
|
|
|
|
|
|
20068
|
8
|
100
|
|
|
|
|
for (size_t i = 0; i < forms.size(); i++) { |
|
20069
|
|
|
|
|
|
|
// The form might contain spaces, even '\r', '\n' or '\t', |
|
20070
|
|
|
|
|
|
|
// which we change to space. We also normalize multiple spaces to one. |
|
20071
|
|
|
|
|
|
|
tok.form.clear(); |
|
20072
|
41
|
100
|
|
|
|
|
for (size_t j = 0; j < forms[i].len; j++) { |
|
20073
|
34
|
|
|
|
|
|
char chr = forms[i].str[j]; |
|
20074
|
34
|
50
|
|
|
|
|
if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' '; |
|
|
|
50
|
|
|
|
|
|
|
20075
|
34
|
50
|
|
|
|
|
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
20076
|
34
|
|
|
|
|
|
tok.form.push_back(chr); |
|
20077
|
|
|
|
|
|
|
} |
|
20078
|
|
|
|
|
|
|
|
|
20079
|
|
|
|
|
|
|
// Track pre-sentence spaces and store SpacesBefore |
|
20080
|
7
|
100
|
|
|
|
|
if (i == 0) { |
|
20081
|
1
|
50
|
|
|
|
|
if (forms[0].str > text.str) |
|
20082
|
0
|
|
|
|
|
|
saved_spaces.append(text.str, forms[0].str - text.str); |
|
20083
|
1
|
|
|
|
|
|
preceeding_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n'); |
|
20084
|
|
|
|
|
|
|
} |
|
20085
|
7
|
50
|
|
|
|
|
if (!normalized_spaces) { |
|
20086
|
15
|
100
|
|
|
|
|
tok.set_spaces_before(i == 0 ? saved_spaces : ""); |
|
|
|
50
|
|
|
|
|
|
|
20087
|
|
|
|
|
|
|
} |
|
20088
|
|
|
|
|
|
|
saved_spaces.clear(); |
|
20089
|
|
|
|
|
|
|
|
|
20090
|
|
|
|
|
|
|
// Track post-sentence spaces and store SpaceAfter, SpacesInToken and SpacesAfter |
|
20091
|
7
|
100
|
|
|
|
|
if (i+1 == forms.size()) { |
|
20092
|
1
|
|
|
|
|
|
text.len -= forms[i].str + forms[i].len - text.str; |
|
20093
|
1
|
|
|
|
|
|
text.str = forms[i].str + forms[i].len; |
|
20094
|
|
|
|
|
|
|
|
|
20095
|
|
|
|
|
|
|
string_piece following; |
|
20096
|
3
|
100
|
|
|
|
|
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
20097
|
0
|
0
|
|
|
|
|
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following) |
|
|
|
0
|
|
|
|
|
|
|
20098
|
1
|
|
|
|
|
|
saved_spaces.append(text.str, following.str - text.str); |
|
20099
|
|
|
|
|
|
|
|
|
20100
|
1
|
|
|
|
|
|
following_newlines += count(saved_spaces.begin(), saved_spaces.end(), '\n'); |
|
20101
|
|
|
|
|
|
|
} |
|
20102
|
7
|
50
|
|
|
|
|
if (normalized_spaces) { |
|
20103
|
0
|
0
|
|
|
|
|
tok.set_space_after(i+1 == forms.size() ? !saved_spaces.empty() : forms[i+1].str > forms[i].str + forms[i].len); |
|
20104
|
|
|
|
|
|
|
} else { |
|
20105
|
7
|
50
|
|
|
|
|
tok.set_spaces_in_token(tok.form.size() != forms[i].len ? forms[i] : ""); |
|
20106
|
7
|
100
|
|
|
|
|
tok.set_spaces_after(i+1 == forms.size() ? saved_spaces : string_piece(forms[i].str + forms[i].len, forms[i+1].str - forms[i].str - forms[i].len)); |
|
20107
|
|
|
|
|
|
|
} |
|
20108
|
|
|
|
|
|
|
saved_spaces.clear(); |
|
20109
|
|
|
|
|
|
|
|
|
20110
|
|
|
|
|
|
|
// Store TokenRange if requested |
|
20111
|
7
|
50
|
|
|
|
|
if (token_ranges) |
|
20112
|
0
|
|
|
|
|
|
tok.set_token_range(unicode_offset + tokens[i].start, unicode_offset + tokens[i].start + tokens[i].length); |
|
20113
|
|
|
|
|
|
|
|
|
20114
|
7
|
50
|
|
|
|
|
if (splitter) |
|
20115
|
7
|
|
|
|
|
|
splitter->append_token(tok.form, tok.misc, s); |
|
20116
|
|
|
|
|
|
|
else |
|
20117
|
0
|
|
|
|
|
|
s.add_word(tok.form).misc.assign(tok.misc); |
|
20118
|
|
|
|
|
|
|
} |
|
20119
|
|
|
|
|
|
|
|
|
20120
|
|
|
|
|
|
|
// Mark new document if needed |
|
20121
|
1
|
50
|
|
|
|
|
if (new_document) { |
|
20122
|
1
|
|
|
|
|
|
s.set_new_doc(true, document_id); |
|
20123
|
1
|
|
|
|
|
|
new_document = false; |
|
20124
|
|
|
|
|
|
|
} |
|
20125
|
|
|
|
|
|
|
|
|
20126
|
|
|
|
|
|
|
// Mark new paragraph if needed |
|
20127
|
1
|
50
|
|
|
|
|
if (preceeding_newlines >= 2) |
|
20128
|
1
|
|
|
|
|
|
s.set_new_par(true); |
|
20129
|
1
|
|
|
|
|
|
preceeding_newlines = following_newlines; |
|
20130
|
|
|
|
|
|
|
|
|
20131
|
1
|
50
|
|
|
|
|
s.set_sent_id(to_string(sentence_id++)); |
|
20132
|
|
|
|
|
|
|
|
|
20133
|
|
|
|
|
|
|
// Fill "# text" comment |
|
20134
|
8
|
|
|
|
|
|
s.comments.emplace_back("# text = "); |
|
20135
|
8
|
100
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
20136
|
7
|
50
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
|
|
0
|
|
|
|
|
|
|
20137
|
7
|
50
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
20138
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
|
20139
|
|
|
|
|
|
|
|
|
20140
|
|
|
|
|
|
|
s.comments.back().append(tok.form); |
|
20141
|
7
|
100
|
|
|
|
|
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
20142
|
|
|
|
|
|
|
} |
|
20143
|
|
|
|
|
|
|
|
|
20144
|
|
|
|
|
|
|
return true; |
|
20145
|
|
|
|
|
|
|
} |
|
20146
|
|
|
|
|
|
|
|
|
20147
|
|
|
|
|
|
|
// Save unused text parts. |
|
20148
|
1
|
50
|
|
|
|
|
if (text.len) { |
|
20149
|
0
|
|
|
|
|
|
saved_spaces.append(text.str, text.len); |
|
20150
|
0
|
|
|
|
|
|
text.str += text.len; |
|
20151
|
2
|
|
|
|
|
|
text.len = 0; |
|
20152
|
|
|
|
|
|
|
} |
|
20153
|
|
|
|
|
|
|
|
|
20154
|
|
|
|
|
|
|
return false; |
|
20155
|
|
|
|
|
|
|
} |
|
20156
|
|
|
|
|
|
|
|
|
20157
|
|
|
|
|
|
|
///////// |
|
20158
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter.cpp |
|
20159
|
|
|
|
|
|
|
///////// |
|
20160
|
|
|
|
|
|
|
|
|
20161
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
20162
|
|
|
|
|
|
|
// |
|
20163
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
20164
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20165
|
|
|
|
|
|
|
// |
|
20166
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20167
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20168
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20169
|
|
|
|
|
|
|
|
|
20170
|
7
|
|
|
|
|
|
void multiword_splitter::append_token(string_piece token, string_piece misc, sentence& s) const { |
|
20171
|
|
|
|
|
|
|
using namespace unilib; |
|
20172
|
|
|
|
|
|
|
|
|
20173
|
|
|
|
|
|
|
// Buffer |
|
20174
|
|
|
|
|
|
|
s.add_word(); |
|
20175
|
7
|
|
|
|
|
|
string& buffer = s.words.back().form; |
|
20176
|
|
|
|
|
|
|
|
|
20177
|
|
|
|
|
|
|
// Lowercase the token |
|
20178
|
7
|
|
|
|
|
|
utf8::map(unicode::lowercase, token.str, token.len, buffer); |
|
20179
|
|
|
|
|
|
|
reverse(buffer.begin(), buffer.end()); |
|
20180
|
|
|
|
|
|
|
|
|
20181
|
|
|
|
|
|
|
// Try finding lowercased version in the full_rules |
|
20182
|
|
|
|
|
|
|
size_t prefix_len = 0; |
|
20183
|
|
|
|
|
|
|
auto it = full_rules.find(buffer); |
|
20184
|
|
|
|
|
|
|
|
|
20185
|
7
|
50
|
|
|
|
|
if (it == full_rules.end()) { |
|
20186
|
7
|
50
|
|
|
|
|
if (version >= 2) { |
|
20187
|
0
|
|
|
|
|
|
string& suffix = s.words.back().misc; |
|
20188
|
|
|
|
|
|
|
// Try searching suffix_rules if needed |
|
20189
|
0
|
0
|
|
|
|
|
while (suffix.size() + 1 < buffer.size()) { |
|
20190
|
0
|
|
|
|
|
|
suffix.push_back(buffer[suffix.size()]); |
|
20191
|
|
|
|
|
|
|
|
|
20192
|
|
|
|
|
|
|
auto suffix_it = suffix_rules.find(suffix); |
|
20193
|
0
|
0
|
|
|
|
|
if (suffix_it == suffix_rules.end()) |
|
20194
|
|
|
|
|
|
|
break; |
|
20195
|
|
|
|
|
|
|
|
|
20196
|
0
|
0
|
|
|
|
|
if (!suffix_it->second.words.empty()) { |
|
20197
|
|
|
|
|
|
|
it = suffix_it; |
|
20198
|
0
|
|
|
|
|
|
prefix_len = buffer.size() - suffix.size(); |
|
20199
|
|
|
|
|
|
|
} |
|
20200
|
|
|
|
|
|
|
} |
|
20201
|
|
|
|
|
|
|
suffix.clear(); |
|
20202
|
|
|
|
|
|
|
} |
|
20203
|
|
|
|
|
|
|
|
|
20204
|
7
|
50
|
|
|
|
|
if (!prefix_len) { |
|
20205
|
|
|
|
|
|
|
// No match |
|
20206
|
14
|
|
|
|
|
|
s.words.back().form.assign(token.str, token.len); |
|
20207
|
7
|
100
|
|
|
|
|
if (misc.len) s.words.back().misc.assign(misc.str, misc.len); |
|
20208
|
|
|
|
|
|
|
return; |
|
20209
|
|
|
|
|
|
|
} |
|
20210
|
|
|
|
|
|
|
} |
|
20211
|
|
|
|
|
|
|
|
|
20212
|
|
|
|
|
|
|
// Determine casing |
|
20213
|
|
|
|
|
|
|
enum { UC_FIRST, UC_ALL, UC_OTHER }; int casing = UC_OTHER; |
|
20214
|
|
|
|
|
|
|
|
|
20215
|
0
|
0
|
|
|
|
|
if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) { |
|
20216
|
|
|
|
|
|
|
casing = UC_ALL; |
|
20217
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(token.str, token.len)) |
|
20218
|
0
|
0
|
|
|
|
|
if (unicode::category(chr) & (unicode::L & ~unicode::Lut)) { casing = UC_FIRST; break; } |
|
20219
|
|
|
|
|
|
|
} |
|
20220
|
|
|
|
|
|
|
|
|
20221
|
|
|
|
|
|
|
// Fill the multiword token |
|
20222
|
0
|
|
|
|
|
|
s.multiword_tokens.emplace_back(s.words.back().id, s.words.back().id + (int)it->second.words.size() - 1, token, misc); |
|
20223
|
|
|
|
|
|
|
|
|
20224
|
|
|
|
|
|
|
s.words.back().form.clear(); |
|
20225
|
0
|
0
|
|
|
|
|
if (prefix_len) { |
|
20226
|
|
|
|
|
|
|
// Note that prefix_len is measured in byte length of lowercased characters |
|
20227
|
0
|
|
|
|
|
|
string_piece suffix(token); |
|
20228
|
0
|
0
|
|
|
|
|
while (s.words.back().form.size() < prefix_len && suffix.len) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20229
|
0
|
|
|
|
|
|
utf8::append(s.words.back().form, unicode::lowercase(utf8::decode(suffix.str, suffix.len))); |
|
20230
|
0
|
|
|
|
|
|
s.words.back().form.assign(token.str, token.len - suffix.len); |
|
20231
|
|
|
|
|
|
|
} |
|
20232
|
0
|
0
|
|
|
|
|
for (auto&& chr : utf8::decoder(it->second.words[0])) |
|
20233
|
0
|
0
|
|
|
|
|
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20234
|
|
|
|
|
|
|
|
|
20235
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < it->second.words.size(); i++) |
|
20236
|
0
|
0
|
|
|
|
|
if (casing != UC_ALL) { |
|
20237
|
|
|
|
|
|
|
s.add_word(it->second.words[i]); |
|
20238
|
|
|
|
|
|
|
} else { |
|
20239
|
|
|
|
|
|
|
s.add_word(); |
|
20240
|
0
|
|
|
|
|
|
utf8::map(unicode::uppercase, it->second.words[i], s.words.back().form); |
|
20241
|
|
|
|
|
|
|
} |
|
20242
|
|
|
|
|
|
|
} |
|
20243
|
|
|
|
|
|
|
|
|
20244
|
1
|
|
|
|
|
|
multiword_splitter* multiword_splitter::load(istream& is) { |
|
20245
|
|
|
|
|
|
|
char version; |
|
20246
|
1
|
50
|
|
|
|
|
if (!is.get(version)) return nullptr; |
|
20247
|
1
|
50
|
|
|
|
|
if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr; |
|
20248
|
|
|
|
|
|
|
|
|
20249
|
|
|
|
|
|
|
binary_decoder data; |
|
20250
|
1
|
50
|
|
|
|
|
if (!compressor::load(is, data)) return nullptr; |
|
|
|
50
|
|
|
|
|
|
|
20251
|
|
|
|
|
|
|
|
|
20252
|
1
|
50
|
|
|
|
|
unique_ptr splitter(new multiword_splitter(version)); |
|
20253
|
|
|
|
|
|
|
try { |
|
20254
|
1
|
50
|
|
|
|
|
for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) { |
|
|
|
50
|
|
|
|
|
|
|
20255
|
|
|
|
|
|
|
string full_rule; |
|
20256
|
0
|
0
|
|
|
|
|
data.next_str(full_rule); |
|
20257
|
|
|
|
|
|
|
reverse(full_rule.begin(), full_rule.end()); |
|
20258
|
|
|
|
|
|
|
|
|
20259
|
|
|
|
|
|
|
// Add the full_rule and its words |
|
20260
|
|
|
|
|
|
|
auto& info = splitter->full_rules[full_rule]; |
|
20261
|
0
|
0
|
|
|
|
|
for (unsigned words = data.next_1B(); words; words--) { |
|
|
|
0
|
|
|
|
|
|
|
20262
|
0
|
0
|
|
|
|
|
info.words.emplace_back(); |
|
20263
|
0
|
0
|
|
|
|
|
data.next_str(info.words.back()); |
|
20264
|
|
|
|
|
|
|
} |
|
20265
|
0
|
0
|
|
|
|
|
if (info.words.empty()) return nullptr; |
|
20266
|
|
|
|
|
|
|
} |
|
20267
|
|
|
|
|
|
|
|
|
20268
|
1
|
50
|
|
|
|
|
if (version >= 2) |
|
20269
|
0
|
0
|
|
|
|
|
for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) { |
|
|
|
0
|
|
|
|
|
|
|
20270
|
|
|
|
|
|
|
string suffix_rule; |
|
20271
|
0
|
0
|
|
|
|
|
data.next_str(suffix_rule); |
|
20272
|
|
|
|
|
|
|
reverse(suffix_rule.begin(), suffix_rule.end()); |
|
20273
|
|
|
|
|
|
|
|
|
20274
|
|
|
|
|
|
|
// Add the suffix_rule and its words |
|
20275
|
|
|
|
|
|
|
auto& info = splitter->suffix_rules[suffix_rule]; |
|
20276
|
0
|
0
|
|
|
|
|
for (unsigned words = data.next_1B(); words; words--) { |
|
|
|
0
|
|
|
|
|
|
|
20277
|
0
|
0
|
|
|
|
|
info.words.emplace_back(); |
|
20278
|
0
|
0
|
|
|
|
|
data.next_str(info.words.back()); |
|
20279
|
|
|
|
|
|
|
} |
|
20280
|
0
|
0
|
|
|
|
|
if (info.words.empty()) return nullptr; |
|
20281
|
|
|
|
|
|
|
|
|
20282
|
|
|
|
|
|
|
// Add prefixes of the suffix with empty data |
|
20283
|
0
|
0
|
|
|
|
|
if (!suffix_rule.empty()) |
|
20284
|
0
|
0
|
|
|
|
|
for (suffix_rule.pop_back(); !suffix_rule.empty(); suffix_rule.pop_back()) |
|
20285
|
|
|
|
|
|
|
splitter->suffix_rules[suffix_rule]; |
|
20286
|
|
0
|
|
|
|
|
} |
|
20287
|
|
|
|
|
|
|
} catch (binary_decoder_error&) { |
|
20288
|
|
|
|
|
|
|
return nullptr; |
|
20289
|
|
|
|
|
|
|
} |
|
20290
|
|
|
|
|
|
|
|
|
20291
|
1
|
50
|
|
|
|
|
return data.is_end() ? splitter.release() : nullptr; |
|
20292
|
|
|
|
|
|
|
} |
|
20293
|
|
|
|
|
|
|
|
|
20294
|
|
|
|
|
|
|
///////// |
|
20295
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter_trainer.h |
|
20296
|
|
|
|
|
|
|
///////// |
|
20297
|
|
|
|
|
|
|
|
|
20298
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
20299
|
|
|
|
|
|
|
// |
|
20300
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
20301
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20302
|
|
|
|
|
|
|
// |
|
20303
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20304
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20305
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20306
|
|
|
|
|
|
|
|
|
20307
|
|
|
|
|
|
|
class multiword_splitter_trainer { |
|
20308
|
|
|
|
|
|
|
public: |
|
20309
|
|
|
|
|
|
|
static bool train(const vector& data, ostream& os, string& error); |
|
20310
|
|
|
|
|
|
|
}; |
|
20311
|
|
|
|
|
|
|
|
|
20312
|
|
|
|
|
|
|
///////// |
|
20313
|
|
|
|
|
|
|
// File: tokenizer/multiword_splitter_trainer.cpp |
|
20314
|
|
|
|
|
|
|
///////// |
|
20315
|
|
|
|
|
|
|
|
|
20316
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
20317
|
|
|
|
|
|
|
// |
|
20318
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
20319
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20320
|
|
|
|
|
|
|
// |
|
20321
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20322
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20323
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20324
|
|
|
|
|
|
|
|
|
20325
|
0
|
|
|
|
|
|
bool multiword_splitter_trainer::train(const vector& data, ostream& os, string& error) { |
|
20326
|
|
|
|
|
|
|
using namespace unilib; |
|
20327
|
|
|
|
|
|
|
error.clear(); |
|
20328
|
|
|
|
|
|
|
|
|
20329
|
|
|
|
|
|
|
// Train |
|
20330
|
0
|
|
|
|
|
|
struct rule_info { |
|
20331
|
|
|
|
|
|
|
vector words; |
|
20332
|
|
|
|
|
|
|
unsigned count = 0; |
|
20333
|
|
|
|
|
|
|
}; |
|
20334
|
|
|
|
|
|
|
map full_rules, suffix_rules; |
|
20335
|
|
|
|
|
|
|
|
|
20336
|
|
|
|
|
|
|
// Full rules |
|
20337
|
|
|
|
|
|
|
string lc_form; |
|
20338
|
0
|
|
|
|
|
|
vector lc_words; |
|
20339
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
20340
|
0
|
0
|
|
|
|
|
for (auto&& multiword : sentence.multiword_tokens) { |
|
20341
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, multiword.form, lc_form); |
|
20342
|
0
|
|
|
|
|
|
lc_words.clear(); |
|
20343
|
0
|
0
|
|
|
|
|
for (int i = multiword.id_first; i <= multiword.id_last; i++) |
|
20344
|
0
|
0
|
|
|
|
|
utf8::map(unicode::lowercase, sentence.words[i].form, (lc_words.emplace_back(), lc_words.back())); |
|
20345
|
|
|
|
|
|
|
|
|
20346
|
0
|
0
|
|
|
|
|
auto& info = full_rules[lc_form]; |
|
20347
|
0
|
0
|
|
|
|
|
if (info.words.empty()) |
|
20348
|
0
|
|
|
|
|
|
info.words.assign(lc_words.begin(), lc_words.end()); |
|
20349
|
0
|
|
|
|
|
|
info.count += lc_words == info.words; |
|
20350
|
0
|
0
|
|
|
|
|
if (!info.count) full_rules.erase(lc_form); |
|
20351
|
|
|
|
|
|
|
} |
|
20352
|
|
|
|
|
|
|
|
|
20353
|
|
|
|
|
|
|
// Remove the full rules which trigger too negatively |
|
20354
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
20355
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
|
20356
|
0
|
0
|
|
|
|
|
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20357
|
0
|
|
|
|
|
|
i = sentence.multiword_tokens[j++].id_last; |
|
20358
|
|
|
|
|
|
|
continue; |
|
20359
|
|
|
|
|
|
|
} |
|
20360
|
|
|
|
|
|
|
|
|
20361
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, sentence.words[i].form, lc_form); |
|
20362
|
|
|
|
|
|
|
auto it = full_rules.find(lc_form); |
|
20363
|
0
|
0
|
|
|
|
|
if (it != full_rules.end()) |
|
20364
|
0
|
0
|
|
|
|
|
if (!--it->second.count) |
|
20365
|
|
|
|
|
|
|
full_rules.erase(it); |
|
20366
|
|
|
|
|
|
|
} |
|
20367
|
|
|
|
|
|
|
|
|
20368
|
|
|
|
|
|
|
// Suffix rules |
|
20369
|
0
|
0
|
|
|
|
|
for (auto&& full_rule : full_rules) { |
|
20370
|
|
|
|
|
|
|
size_t prefix_match = 0; |
|
20371
|
0
|
0
|
|
|
|
|
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20372
|
0
|
0
|
|
|
|
|
for (; prefix_match; prefix_match--) |
|
20373
|
0
|
0
|
|
|
|
|
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20374
|
0
|
0
|
|
|
|
|
lc_form.assign(full_rule.first, prefix_match, string::npos); |
|
20375
|
|
|
|
|
|
|
lc_words.assign(full_rule.second.words.begin(), full_rule.second.words.end()); |
|
20376
|
0
|
0
|
|
|
|
|
lc_words[0].erase(0, prefix_match); |
|
20377
|
|
|
|
|
|
|
|
|
20378
|
0
|
0
|
|
|
|
|
auto& info = suffix_rules[lc_form]; |
|
20379
|
0
|
0
|
|
|
|
|
if (info.words.empty()) |
|
20380
|
0
|
|
|
|
|
|
info.words.assign(lc_words.begin(), lc_words.end()); |
|
20381
|
0
|
|
|
|
|
|
info.count += lc_words == info.words; |
|
20382
|
0
|
0
|
|
|
|
|
if (!info.count) suffix_rules.erase(lc_form); |
|
20383
|
|
|
|
|
|
|
} |
|
20384
|
|
|
|
|
|
|
} |
|
20385
|
|
|
|
|
|
|
|
|
20386
|
|
|
|
|
|
|
// Remove the suffix rules which trigger too negatively |
|
20387
|
0
|
0
|
|
|
|
|
for (auto&& sentence : data) |
|
20388
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
|
20389
|
0
|
0
|
|
|
|
|
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20390
|
0
|
|
|
|
|
|
i = sentence.multiword_tokens[j++].id_last; |
|
20391
|
0
|
|
|
|
|
|
continue; |
|
20392
|
|
|
|
|
|
|
} |
|
20393
|
|
|
|
|
|
|
|
|
20394
|
|
|
|
|
|
|
utf8::map(unicode::lowercase, sentence.words[i].form, lc_form); |
|
20395
|
0
|
0
|
|
|
|
|
while (lc_form.size() > 1) { |
|
20396
|
0
|
0
|
|
|
|
|
lc_form.erase(0, 1); |
|
20397
|
|
|
|
|
|
|
auto it = suffix_rules.find(lc_form); |
|
20398
|
0
|
0
|
|
|
|
|
if (it != suffix_rules.end()) { |
|
20399
|
0
|
0
|
|
|
|
|
if (it->second.count <= 10) |
|
20400
|
|
|
|
|
|
|
suffix_rules.erase(it); |
|
20401
|
|
|
|
|
|
|
else |
|
20402
|
0
|
|
|
|
|
|
it->second.count -= 10; |
|
20403
|
|
|
|
|
|
|
} |
|
20404
|
|
|
|
|
|
|
} |
|
20405
|
|
|
|
|
|
|
} |
|
20406
|
|
|
|
|
|
|
|
|
20407
|
|
|
|
|
|
|
// Encode |
|
20408
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
|
20409
|
0
|
|
|
|
|
|
enc.add_4B(full_rules.size()); |
|
20410
|
0
|
0
|
|
|
|
|
for (auto&& full_rule : full_rules) { |
|
20411
|
0
|
0
|
|
|
|
|
enc.add_str(full_rule.first); |
|
20412
|
0
|
0
|
|
|
|
|
enc.add_1B(full_rule.second.words.size()); |
|
20413
|
0
|
0
|
|
|
|
|
for (auto& word : full_rule.second.words) |
|
20414
|
0
|
0
|
|
|
|
|
enc.add_str(word); |
|
20415
|
|
|
|
|
|
|
} |
|
20416
|
0
|
|
|
|
|
|
enc.add_4B(suffix_rules.size()); |
|
20417
|
0
|
0
|
|
|
|
|
for (auto&& suffix_rule : suffix_rules) { |
|
20418
|
0
|
0
|
|
|
|
|
enc.add_str(suffix_rule.first); |
|
20419
|
0
|
0
|
|
|
|
|
enc.add_1B(suffix_rule.second.words.size()); |
|
20420
|
0
|
0
|
|
|
|
|
for (auto& word : suffix_rule.second.words) |
|
20421
|
0
|
0
|
|
|
|
|
enc.add_str(word); |
|
20422
|
|
|
|
|
|
|
} |
|
20423
|
|
|
|
|
|
|
|
|
20424
|
|
|
|
|
|
|
// Save |
|
20425
|
0
|
0
|
|
|
|
|
os.put(multiword_splitter::VERSION_LATEST); |
|
20426
|
0
|
0
|
|
|
|
|
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20427
|
|
|
|
|
|
|
|
|
20428
|
|
|
|
|
|
|
return true; |
|
20429
|
|
|
|
|
|
|
} |
|
20430
|
|
|
|
|
|
|
|
|
20431
|
|
|
|
|
|
|
///////// |
|
20432
|
|
|
|
|
|
|
// File: trainer/trainer.h |
|
20433
|
|
|
|
|
|
|
///////// |
|
20434
|
|
|
|
|
|
|
|
|
20435
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
20436
|
|
|
|
|
|
|
// |
|
20437
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20438
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20439
|
|
|
|
|
|
|
// |
|
20440
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20441
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20442
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20443
|
|
|
|
|
|
|
|
|
20444
|
|
|
|
|
|
|
class trainer { |
|
20445
|
|
|
|
|
|
|
public: |
|
20446
|
|
|
|
|
|
|
static bool train(const string& method, const vector& train, const vector& heldout, |
|
20447
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error); |
|
20448
|
|
|
|
|
|
|
|
|
20449
|
|
|
|
|
|
|
static const string DEFAULT; |
|
20450
|
|
|
|
|
|
|
static const string NONE; |
|
20451
|
|
|
|
|
|
|
|
|
20452
|
|
|
|
|
|
|
protected: |
|
20453
|
|
|
|
|
|
|
static unsigned hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum); |
|
20454
|
|
|
|
|
|
|
static double hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum); |
|
20455
|
|
|
|
|
|
|
static double hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum); |
|
20456
|
|
|
|
|
|
|
|
|
20457
|
|
|
|
|
|
|
private: |
|
20458
|
|
|
|
|
|
|
static double rnd(unsigned run, unsigned index); |
|
20459
|
|
|
|
|
|
|
}; |
|
20460
|
|
|
|
|
|
|
|
|
20461
|
|
|
|
|
|
|
///////// |
|
20462
|
|
|
|
|
|
|
// File: trainer/trainer_morphodita_parsito.h |
|
20463
|
|
|
|
|
|
|
///////// |
|
20464
|
|
|
|
|
|
|
|
|
20465
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
20466
|
|
|
|
|
|
|
// |
|
20467
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20468
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20469
|
|
|
|
|
|
|
// |
|
20470
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20471
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20472
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20473
|
|
|
|
|
|
|
|
|
20474
|
|
|
|
|
|
|
class trainer_morphodita_parsito : public trainer { |
|
20475
|
|
|
|
|
|
|
public: |
|
20476
|
|
|
|
|
|
|
static bool train(const vector& training, const vector& heldout, |
|
20477
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error); |
|
20478
|
|
|
|
|
|
|
|
|
20479
|
|
|
|
|
|
|
private: |
|
20480
|
|
|
|
|
|
|
static bool train_tokenizer(const vector& training, const vector& heldout, |
|
20481
|
|
|
|
|
|
|
const string& options, ostream& os, string& error); |
|
20482
|
|
|
|
|
|
|
static bool train_tagger(const vector& training, const vector& heldout, |
|
20483
|
|
|
|
|
|
|
const string& options, ostream& os, string& error); |
|
20484
|
|
|
|
|
|
|
static bool train_parser(const vector& training, const vector& heldout, |
|
20485
|
|
|
|
|
|
|
const string& options, const string& tagger_model, ostream& os, string& error); |
|
20486
|
|
|
|
|
|
|
|
|
20487
|
|
|
|
|
|
|
// Generic model methods |
|
20488
|
|
|
|
|
|
|
enum model_type { TOKENIZER_MODEL, TAGGER_MODEL, PARSER_MODEL }; |
|
20489
|
|
|
|
|
|
|
static bool load_model(const string& data, model_type model, string_piece& range); |
|
20490
|
|
|
|
|
|
|
static const string& model_normalize_form(string_piece form, string& output); |
|
20491
|
|
|
|
|
|
|
static const string& model_normalize_lemma(string_piece lemma, string& output); |
|
20492
|
|
|
|
|
|
|
static void model_fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word); |
|
20493
|
|
|
|
|
|
|
|
|
20494
|
|
|
|
|
|
|
// Tagger-specific model methods |
|
20495
|
|
|
|
|
|
|
static bool train_tagger_model(const vector& training, const vector& heldout, |
|
20496
|
|
|
|
|
|
|
unsigned model, unsigned models, const named_values::map& tagger, ostream& os, string& error); |
|
20497
|
|
|
|
|
|
|
static bool can_combine_tag(const word& w, string& error); |
|
20498
|
|
|
|
|
|
|
static const string& combine_tag(const word& w, bool xpostag, bool feats, string& combined_tag); |
|
20499
|
|
|
|
|
|
|
static const string& most_frequent_tag(const vector& data, const string& upostag, bool xpostag, bool feats, string& combined_tag); |
|
20500
|
|
|
|
|
|
|
static const string& combine_lemma(const word& w, int use_lemma, string& combined_lemma, const unordered_set& flat_lemmas = unordered_set()); |
|
20501
|
|
|
|
|
|
|
|
|
20502
|
|
|
|
|
|
|
// Generic options handling |
|
20503
|
|
|
|
|
|
|
static const string& option_str(const named_values::map& options, const string& name, int model = -1); |
|
20504
|
|
|
|
|
|
|
static bool option_int(const named_values::map& options, const string& name, int& value, string& error, int model = -1); |
|
20505
|
|
|
|
|
|
|
static bool option_bool(const named_values::map& options, const string& name, bool& value, string& error, int model = -1); |
|
20506
|
|
|
|
|
|
|
static bool option_double(const named_values::map& options, const string& name, double& value, string& error, int model = -1); |
|
20507
|
|
|
|
|
|
|
|
|
20508
|
|
|
|
|
|
|
// Various string data |
|
20509
|
|
|
|
|
|
|
static const string empty_string; |
|
20510
|
|
|
|
|
|
|
static const string tag_separators; |
|
20511
|
|
|
|
|
|
|
static const string tagger_features_tagger; |
|
20512
|
|
|
|
|
|
|
static const string tagger_features_lemmatizer; |
|
20513
|
|
|
|
|
|
|
static const string parser_nodes; |
|
20514
|
|
|
|
|
|
|
}; |
|
20515
|
|
|
|
|
|
|
|
|
20516
|
|
|
|
|
|
|
///////// |
|
20517
|
|
|
|
|
|
|
// File: trainer/trainer.cpp |
|
20518
|
|
|
|
|
|
|
///////// |
|
20519
|
|
|
|
|
|
|
|
|
20520
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
20521
|
|
|
|
|
|
|
// |
|
20522
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20523
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20524
|
|
|
|
|
|
|
// |
|
20525
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20526
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20527
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20528
|
|
|
|
|
|
|
|
|
20529
|
2
|
|
|
|
|
|
// Out-of-class definition of the static member trainer::DEFAULT. It is
// default-constructed, i.e. empty (assuming `string` is std::string).
const string trainer::DEFAULT; |
|
20530
|
2
|
|
|
|
|
|
// Out-of-class definition of the static member trainer::NONE, which holds
// the literal text "none".
const string trainer::NONE = "none"; |
|
20531
|
|
|
|
|
|
|
|
|
20532
|
0
|
|
|
|
|
|
bool trainer::train(const string& method, const vector& training, const vector& heldout, |
|
20533
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error) { |
|
20534
|
|
|
|
|
|
|
error.clear(); |
|
20535
|
|
|
|
|
|
|
|
|
20536
|
0
|
0
|
|
|
|
|
stringstream os_buffer; |
|
20537
|
0
|
0
|
|
|
|
|
os_buffer.put(method.size()); |
|
20538
|
0
|
0
|
|
|
|
|
os_buffer.write(method.c_str(), method.size()); |
|
20539
|
|
|
|
|
|
|
|
|
20540
|
|
|
|
|
|
|
try { |
|
20541
|
0
|
0
|
|
|
|
|
if (method == "morphodita_parsito") { |
|
20542
|
0
|
0
|
|
|
|
|
if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error)) |
|
|
|
0
|
|
|
|
|
|
|
20543
|
|
|
|
|
|
|
return false; |
|
20544
|
|
|
|
|
|
|
} else { |
|
20545
|
0
|
0
|
|
|
|
|
error.assign("Unknown UDPipe method '").append(method).append("'!"); |
|
|
|
0
|
|
|
|
|
|
|
20546
|
|
|
|
|
|
|
return false; |
|
20547
|
|
0
|
|
|
|
|
} |
|
|
|
0
|
|
|
|
|
|
|
20548
|
|
|
|
|
|
|
} catch (training_error& e) { |
|
20549
|
|
|
|
|
|
|
error.assign(e.what()); |
|
20550
|
|
|
|
|
|
|
return false; |
|
20551
|
|
|
|
|
|
|
} |
|
20552
|
|
|
|
|
|
|
|
|
20553
|
0
|
0
|
|
|
|
|
os << os_buffer.rdbuf(); |
|
20554
|
|
|
|
|
|
|
return true; |
|
20555
|
|
|
|
|
|
|
} |
|
20556
|
|
|
|
|
|
|
|
|
20557
|
0
|
|
|
|
|
|
unsigned trainer::hyperparameter_integer(unsigned run, unsigned index, unsigned minimum, unsigned maximum) { |
|
20558
|
0
|
|
|
|
|
|
return minimum + int((maximum - minimum + 1) * rnd(run, index)); |
|
20559
|
|
|
|
|
|
|
} |
|
20560
|
|
|
|
|
|
|
|
|
20561
|
0
|
|
|
|
|
|
double trainer::hyperparameter_uniform(unsigned run, unsigned index, double minimum, double maximum) { |
|
20562
|
0
|
|
|
|
|
|
return minimum + (maximum - minimum) * rnd(run, index); |
|
20563
|
|
|
|
|
|
|
} |
|
20564
|
|
|
|
|
|
|
|
|
20565
|
0
|
|
|
|
|
|
double trainer::hyperparameter_logarithmic(unsigned run, unsigned index, double minimum, double maximum) { |
|
20566
|
0
|
|
|
|
|
|
return exp(log(minimum) + (log(maximum) - log(minimum)) * rnd(run, index)); |
|
20567
|
|
|
|
|
|
|
} |
|
20568
|
|
|
|
|
|
|
|
|
20569
|
0
|
|
|
|
|
|
// Deterministic pseudo-random value in [0, 1) derived solely from (run, index).
//
// A 32-bit state starting at a fixed seed is advanced ten times by a
// multiply-and-add step whose additive term depends on run and index. The
// upper 16 bits of the final state are then scaled by 1/65536.
double trainer::rnd(unsigned run, unsigned index) {
  // The additive term does not change between rounds, so compute it once.
  const unsigned increment = run * 19999999U + index * 1000000007U + 12345U;

  uint32_t state = 12345U;
  unsigned rounds_left = 10;
  while (rounds_left-- > 0)
    state = state * 1103515245U + increment;

  const uint32_t high_bits = state >> 16;
  return high_bits / double(1<<16);
}
|
20575
|
|
|
|
|
|
|
|
|
20576
|
|
|
|
|
|
|
///////// |
|
20577
|
|
|
|
|
|
|
// File: morphodita/tagger/elementary_features_encoder.h |
|
20578
|
|
|
|
|
|
|
///////// |
|
20579
|
|
|
|
|
|
|
|
|
20580
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
20581
|
|
|
|
|
|
|
// |
|
20582
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20583
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20584
|
|
|
|
|
|
|
// |
|
20585
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20586
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20587
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20588
|
|
|
|
|
|
|
|
|
20589
|
|
|
|
|
|
|
namespace morphodita { |
|
20590
|
|
|
|
|
|
|
|
|
20591
|
|
|
|
|
|
|
template |
|
20592
|
0
|
|
|
|
|
|
inline bool elementary_features |
|
20593
|
0
|
|
|
|
|
|
binary_encoder enc; |
|
20594
|
|
|
|
|
|
|
|
|
20595
|
0
|
0
|
|
|
|
|
enc.add_1B(maps.size()); |
|
20596
|
0
|
0
|
|
|
|
|
for (auto&& map : maps) |
|
20597
|
0
|
0
|
|
|
|
|
map.save(enc); |
|
20598
|
|
|
|
|
|
|
|
|
20599
|
0
|
0
|
|
|
|
|
return compressor::save(os, enc); |
|
20600
|
|
|
|
|
|
|
} |
|
20601
|
|
|
|
|
|
|
|
|
20602
|
|
|
|
|
|
|
} // namespace morphodita |
|
20603
|
|
|
|
|
|
|
|
|
20604
|
|
|
|
|
|
|
///////// |
|
20605
|
|
|
|
|
|
|
// File: morphodita/tagger/feature_sequences_encoder.h |
|
20606
|
|
|
|
|
|
|
///////// |
|
20607
|
|
|
|
|
|
|
|
|
20608
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
20609
|
|
|
|
|
|
|
// |
|
20610
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20611
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20612
|
|
|
|
|
|
|
// |
|
20613
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20614
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20615
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20616
|
|
|
|
|
|
|
|
|
20617
|
|
|
|
|
|
|
namespace morphodita { |
|
20618
|
|
|
|
|
|
|
|
|
20619
|
|
|
|
|
|
|
template |
|
20620
|
0
|
|
|
|
|
|
void feature_sequences::parse(int window_size, istream& is) { |
|
20621
|
|
|
|
|
|
|
unordered_map elementary_map; |
|
20622
|
0
|
0
|
|
|
|
|
for (auto&& description : ElementaryFeatures::descriptions) |
|
20623
|
0
|
0
|
|
|
|
|
if (!elementary_map.emplace(description.name, description).second) |
|
20624
|
0
|
0
|
|
|
|
|
training_failure("Repeated elementary feature with name " << description.name << '!'); |
|
20625
|
|
|
|
|
|
|
|
|
20626
|
|
|
|
|
|
|
string line; |
|
20627
|
0
|
|
|
|
|
|
vector tokens; |
|
20628
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
|
0
|
|
|
|
|
|
|
20629
|
0
|
0
|
|
|
|
|
split(line, ',', tokens); |
|
20630
|
0
|
0
|
|
|
|
|
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20631
|
|
|
|
|
|
|
|
|
20632
|
|
|
|
|
|
|
bool contains_only_current = false; |
|
20633
|
0
|
0
|
|
|
|
|
sequences.emplace_back(); |
|
20634
|
0
|
0
|
|
|
|
|
for (auto&& token : tokens) { |
|
20635
|
0
|
|
|
|
|
|
vector parts; |
|
20636
|
0
|
0
|
|
|
|
|
split(token, ' ', parts); |
|
20637
|
0
|
0
|
|
|
|
|
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20638
|
|
|
|
|
|
|
auto it = elementary_map.find(parts[0]); |
|
20639
|
0
|
0
|
|
|
|
|
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20640
|
|
|
|
|
|
|
|
|
20641
|
|
|
|
|
|
|
auto& desc = it->second; |
|
20642
|
0
|
0
|
|
|
|
|
int sequence_index = parse_int(parts[1].c_str(), "sequence_index"); |
|
20643
|
0
|
0
|
|
|
|
|
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20644
|
0
|
0
|
|
|
|
|
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20645
|
0
|
0
|
|
|
|
|
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20646
|
|
|
|
|
|
|
|
|
20647
|
0
|
0
|
|
|
|
|
sequences.back().elements.emplace_back(it->second.type, it->second.index, sequence_index); |
|
20648
|
0
|
0
|
|
|
|
|
if (desc.type == DYNAMIC) sequences.back().dependant_range = max(sequences.back().dependant_range, window_size + 1); |
|
20649
|
0
|
0
|
|
|
|
|
if (desc.type == PER_TAG) sequences.back().dependant_range = max(sequences.back().dependant_range, 1 - sequence_index); |
|
20650
|
0
|
|
|
|
|
|
contains_only_current |= desc.range == ONLY_CURRENT; |
|
20651
|
|
|
|
|
|
|
} |
|
20652
|
0
|
0
|
|
|
|
|
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20653
|
|
|
|
|
|
|
} |
|
20654
|
|
|
|
|
|
|
|
|
20655
|
0
|
|
|
|
|
|
stable_sort(sequences.begin(), sequences.end(), [](const feature_sequence& a, const feature_sequence& b) { return a.dependant_range > b.dependant_range; }); |
|
20656
|
0
|
0
|
|
|
|
|
scores.resize(sequences.size()); |
|
20657
|
0
|
|
|
|
|
|
} |
|
20658
|
|
|
|
|
|
|
|
|
20659
|
|
|
|
|
|
|
template |
|
20660
|
0
|
|
|
|
|
|
inline bool feature_sequences::save(ostream& os) { |
|
20661
|
0
|
0
|
|
|
|
|
if (!elementary.save(os)) return false; |
|
20662
|
|
|
|
|
|
|
|
|
20663
|
0
|
|
|
|
|
|
binary_encoder enc; |
|
20664
|
0
|
0
|
|
|
|
|
enc.add_1B(sequences.size()); |
|
20665
|
0
|
0
|
|
|
|
|
for (auto&& sequence : sequences) { |
|
20666
|
0
|
|
|
|
|
|
enc.add_4B(sequence.dependant_range); |
|
20667
|
0
|
0
|
|
|
|
|
enc.add_1B(sequence.elements.size()); |
|
20668
|
0
|
0
|
|
|
|
|
for (auto&& element : sequence.elements) { |
|
20669
|
0
|
|
|
|
|
|
enc.add_4B(element.type); |
|
20670
|
0
|
|
|
|
|
|
enc.add_4B(element.elementary_index); |
|
20671
|
0
|
|
|
|
|
|
enc.add_4B(element.sequence_index); |
|
20672
|
|
|
|
|
|
|
} |
|
20673
|
|
|
|
|
|
|
} |
|
20674
|
|
|
|
|
|
|
|
|
20675
|
0
|
0
|
|
|
|
|
enc.add_1B(scores.size()); |
|
20676
|
0
|
0
|
|
|
|
|
for (auto&& score : scores) |
|
20677
|
0
|
0
|
|
|
|
|
score.save(enc); |
|
20678
|
|
|
|
|
|
|
|
|
20679
|
0
|
0
|
|
|
|
|
return compressor::save(os, enc); |
|
20680
|
|
|
|
|
|
|
} |
|
20681
|
|
|
|
|
|
|
|
|
20682
|
|
|
|
|
|
|
} // namespace morphodita |
|
20683
|
|
|
|
|
|
|
|
|
20684
|
|
|
|
|
|
|
///////// |
|
20685
|
|
|
|
|
|
|
// File: morphodita/tagger/training_maps.h |
|
20686
|
|
|
|
|
|
|
///////// |
|
20687
|
|
|
|
|
|
|
|
|
20688
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
20689
|
|
|
|
|
|
|
// |
|
20690
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20691
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20692
|
|
|
|
|
|
|
// |
|
20693
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20694
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20695
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20696
|
|
|
|
|
|
|
|
|
20697
|
|
|
|
|
|
|
namespace morphodita { |
|
20698
|
|
|
|
|
|
|
|
|
20699
|
|
|
|
|
|
|
// Declarations |
|
20700
|
0
|
0
|
|
|
|
|
class training_elementary_feature_map { |
|
|
|
0
|
|
|
|
|
|
|
20701
|
|
|
|
|
|
|
public: |
|
20702
|
|
|
|
|
|
|
inline elementary_feature_value value(const char* feature, int len) const; |
|
20703
|
|
|
|
|
|
|
mutable unordered_map map = {{"", elementary_feature_empty}}; |
|
20704
|
|
|
|
|
|
|
private: |
|
20705
|
|
|
|
|
|
|
mutable string key; |
|
20706
|
|
|
|
|
|
|
}; |
|
20707
|
|
|
|
|
|
|
|
|
20708
|
0
|
|
|
|
|
|
class training_feature_sequence_map { |
|
20709
|
|
|
|
|
|
|
public: |
|
20710
|
|
|
|
|
|
|
struct info { |
|
20711
|
|
|
|
|
|
|
// We deliberately use feature_sequence*s*_score to check for overflow |
|
20712
|
|
|
|
|
|
|
feature_sequences_score alpha = 0; |
|
20713
|
|
|
|
|
|
|
feature_sequences_score gamma = 0; |
|
20714
|
|
|
|
|
|
|
int last_gamma_update = 0; |
|
20715
|
|
|
|
|
|
|
}; |
|
20716
|
|
|
|
|
|
|
|
|
20717
|
|
|
|
|
|
|
inline feature_sequence_score score(const char* feature, int len) const; |
|
20718
|
|
|
|
|
|
|
mutable unordered_map map; |
|
20719
|
|
|
|
|
|
|
private: |
|
20720
|
|
|
|
|
|
|
mutable string key; |
|
20721
|
|
|
|
|
|
|
}; |
|
20722
|
|
|
|
|
|
|
|
|
20723
|
|
|
|
|
|
|
template class ElementaryFeatures> using train_feature_sequences = feature_sequences, training_feature_sequence_map>; |
|
20724
|
|
|
|
|
|
|
|
|
20725
|
|
|
|
|
|
|
// Definitions |
|
20726
|
0
|
|
|
|
|
|
// Returns the value stored in `map` for the feature text given by
// (feature, len). If the text has no entry yet, one is inserted first; its
// value is elementary_feature_empty plus the number of entries the map held
// before the insertion.
elementary_feature_value training_elementary_feature_map::value(const char* feature, int len) const {
  // Reuse the member buffer as the lookup key.
  key.assign(feature, len);

  // Value a new entry would get; it is ignored if the key already exists.
  const auto candidate = elementary_feature_empty + elementary_feature_value(map.size());

  auto inserted = map.emplace(key, candidate);
  return inserted.first->second;
}
|
20730
|
|
|
|
|
|
|
|
|
20731
|
0
|
|
|
|
|
|
// Looks up the feature text given by (feature, len) in `map` and returns the
// alpha field of its entry, or 0 when there is no such entry. The lookup
// never inserts into the map.
feature_sequence_score training_feature_sequence_map::score(const char* feature, int len) const {
  // Reuse the member buffer as the lookup key.
  key.assign(feature, len);

  const auto found = map.find(key);
  if (found == map.end())
    return 0;
  return found->second.alpha;
}
|
20736
|
|
|
|
|
|
|
|
|
20737
|
|
|
|
|
|
|
} // namespace morphodita |
|
20738
|
|
|
|
|
|
|
|
|
20739
|
|
|
|
|
|
|
///////// |
|
20740
|
|
|
|
|
|
|
// File: morphodita/tagger/feature_sequences_optimizer.h |
|
20741
|
|
|
|
|
|
|
///////// |
|
20742
|
|
|
|
|
|
|
|
|
20743
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
20744
|
|
|
|
|
|
|
// |
|
20745
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20746
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20747
|
|
|
|
|
|
|
// |
|
20748
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20749
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20750
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20751
|
|
|
|
|
|
|
|
|
20752
|
|
|
|
|
|
|
namespace morphodita { |
|
20753
|
|
|
|
|
|
|
|
|
20754
|
|
|
|
|
|
|
// Declarations |
|
20755
|
|
|
|
|
|
|
template |
|
20756
|
|
|
|
|
|
|
class feature_sequences_optimizer; |
|
20757
|
|
|
|
|
|
|
|
|
20758
|
|
|
|
|
|
|
template class FeatureSequences, template class ElementaryFeatures> |
|
20759
|
|
|
|
|
|
|
class feature_sequences_optimizer, training_feature_sequence_map>> { |
|
20760
|
|
|
|
|
|
|
public: |
|
20761
|
|
|
|
|
|
|
typedef FeatureSequences, training_feature_sequence_map> original_feature_sequences; |
|
20762
|
|
|
|
|
|
|
typedef FeatureSequences, persistent_feature_sequence_map> optimized_feature_sequences; |
|
20763
|
|
|
|
|
|
|
|
|
20764
|
|
|
|
|
|
|
static void optimize(const original_feature_sequences& features, optimized_feature_sequences& optimized_features); |
|
20765
|
|
|
|
|
|
|
}; |
|
20766
|
|
|
|
|
|
|
|
|
20767
|
|
|
|
|
|
|
// Definitions |
|
20768
|
|
|
|
|
|
|
template class FeatureSequences, template class ElementaryFeatures> |
|
20769
|
0
|
|
|
|
|
|
void feature_sequences_optimizer, training_feature_sequence_map>>::optimize(const original_feature_sequences& features, optimized_feature_sequences& optimized_features) { |
|
20770
|
|
|
|
|
|
|
const ElementaryFeatures& elementary = features.elementary; |
|
20771
|
|
|
|
|
|
|
ElementaryFeatures& optimized_elementary = optimized_features.elementary; |
|
20772
|
|
|
|
|
|
|
|
|
20773
|
|
|
|
|
|
|
// Iterate over feature sequences of non-zero weight and count number of |
|
20774
|
|
|
|
|
|
|
// occurrences in corresponding elementary feature maps. |
|
20775
|
|
|
|
|
|
|
// In order to be able to do so, precompute map_index for elements of features.sequences. |
|
20776
|
0
|
|
|
|
|
|
vector> map_indices(features.sequences.size()); |
|
20777
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < map_indices.size(); i++) { |
|
20778
|
0
|
0
|
|
|
|
|
for (auto&& element : features.sequences[i].elements) |
|
20779
|
0
|
0
|
|
|
|
|
for (auto&& description : decltype(features.elementary)::descriptions) |
|
20780
|
0
|
0
|
|
|
|
|
if (element.type == description.type && element.elementary_index == description.index) |
|
|
|
0
|
|
|
|
|
|
|
20781
|
0
|
0
|
|
|
|
|
map_indices[i].emplace_back(description.map_index); |
|
20782
|
|
|
|
|
|
|
|
|
20783
|
0
|
0
|
|
|
|
|
assert(map_indices[i].size() == features.sequences[i].elements.size()); |
|
20784
|
|
|
|
|
|
|
} |
|
20785
|
|
|
|
|
|
|
|
|
20786
|
|
|
|
|
|
|
struct count_info { elementary_feature_value ori = 0; int count = 0; }; |
|
20787
|
0
|
0
|
|
|
|
|
vector> counts(elementary.maps.size()); |
|
20788
|
|
|
|
|
|
|
vector elementary_ids; |
|
20789
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < features.sequences.size(); i++) |
|
20790
|
0
|
0
|
|
|
|
|
for (auto&& element : features.scores[i].map) |
|
20791
|
0
|
0
|
|
|
|
|
if (element.second.gamma) { |
|
20792
|
|
|
|
|
|
|
elementary_ids.clear(); |
|
20793
|
0
|
0
|
|
|
|
|
for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size())) |
|
|
|
0
|
|
|
|
|
|
|
20794
|
0
|
0
|
|
|
|
|
elementary_ids.emplace_back(vli::decode(key)); |
|
20795
|
|
|
|
|
|
|
|
|
20796
|
0
|
0
|
|
|
|
|
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
|
20797
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
|
20798
|
0
|
0
|
|
|
|
|
if (map_indices[i][j] < 0) continue; |
|
20799
|
0
|
0
|
|
|
|
|
if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1); |
|
|
|
0
|
|
|
|
|
|
|
20800
|
0
|
|
|
|
|
|
counts[map_indices[i][j]][elementary_ids[j]].count++; |
|
20801
|
|
|
|
|
|
|
} |
|
20802
|
|
|
|
|
|
|
} |
|
20803
|
|
|
|
|
|
|
|
|
20804
|
|
|
|
|
|
|
// Sort counts by sizes decreasing |
|
20805
|
0
|
0
|
|
|
|
|
for (auto&& count : counts) { |
|
20806
|
0
|
0
|
|
|
|
|
if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1); |
|
|
|
0
|
|
|
|
|
|
|
20807
|
0
|
|
|
|
|
|
count[elementary_feature_unknown].count = 0; |
|
20808
|
0
|
|
|
|
|
|
count[elementary_feature_empty].count = 1; |
|
20809
|
0
|
0
|
|
|
|
|
for (elementary_feature_value i = 0; i < count.size(); i++) count[i].ori = i; |
|
20810
|
0
|
|
|
|
|
|
sort(count.begin() + elementary_feature_empty + 1, count.end(), [](const count_info& a, const count_info& b){ return a.count > b.count; }); |
|
20811
|
|
|
|
|
|
|
} |
|
20812
|
|
|
|
|
|
|
|
|
20813
|
|
|
|
|
|
|
// Create an elementary ids map |
|
20814
|
0
|
0
|
|
|
|
|
vector> elementary_ids_map(counts.size()); |
|
20815
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < counts.size(); i++) { |
|
20816
|
0
|
0
|
|
|
|
|
elementary_ids_map[i].resize(counts[i].size()); |
|
20817
|
0
|
0
|
|
|
|
|
for (elementary_feature_value j = 0; j < counts[i].size(); j++) |
|
20818
|
0
|
0
|
|
|
|
|
elementary_ids_map[i][counts[i][j].ori] = counts[i][j].count ? j : elementary_feature_unknown; |
|
20819
|
|
|
|
|
|
|
} |
|
20820
|
|
|
|
|
|
|
|
|
20821
|
|
|
|
|
|
|
// Make optimized elementary maps by applying elementary ids map |
|
20822
|
|
|
|
|
|
|
optimized_elementary.maps.clear(); |
|
20823
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < elementary.maps.size(); i++) { |
|
20824
|
|
|
|
|
|
|
unordered_map mapped_ids; |
|
20825
|
0
|
0
|
|
|
|
|
for (auto&& element : elementary.maps[i].map) |
|
20826
|
0
|
0
|
|
|
|
|
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20827
|
0
|
|
|
|
|
|
mapped_ids.emplace(element.first, elementary_ids_map[i][element.second]); |
|
20828
|
|
|
|
|
|
|
|
|
20829
|
0
|
0
|
|
|
|
|
optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) { |
|
|
|
0
|
|
|
|
|
|
|
20830
|
|
|
|
|
|
|
enc.add_4B(id); |
|
20831
|
|
|
|
|
|
|
})); |
|
20832
|
|
|
|
|
|
|
} |
|
20833
|
|
|
|
|
|
|
|
|
20834
|
|
|
|
|
|
|
// Remap keys in feature sequences by applying elementary_ids_map to appropriate subkeys |
|
20835
|
0
|
0
|
|
|
|
|
optimized_features.sequences = features.sequences; |
|
20836
|
|
|
|
|
|
|
optimized_features.scores.clear(); |
|
20837
|
|
|
|
|
|
|
vector key_buffer; |
|
20838
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < features.sequences.size(); i++) { |
|
20839
|
|
|
|
|
|
|
decltype(features.scores[i].map) updated_map; |
|
20840
|
0
|
0
|
|
|
|
|
for (auto&& element : features.scores[i].map) |
|
20841
|
0
|
0
|
|
|
|
|
if (element.second.gamma) { |
|
20842
|
|
|
|
|
|
|
elementary_ids.clear(); |
|
20843
|
0
|
0
|
|
|
|
|
for (const char* key = element.first.c_str(); key < element.first.c_str() + element.first.size(); ) |
|
20844
|
0
|
0
|
|
|
|
|
elementary_ids.emplace_back(vli::decode(key)); |
|
20845
|
|
|
|
|
|
|
|
|
20846
|
0
|
0
|
|
|
|
|
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
|
20847
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
|
20848
|
0
|
0
|
|
|
|
|
if (map_indices[i][j] < 0) continue; |
|
20849
|
0
|
0
|
|
|
|
|
assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown); |
|
|
|
0
|
|
|
|
|
|
|
20850
|
0
|
|
|
|
|
|
elementary_ids[j] = elementary_ids_map[map_indices[i][j]][elementary_ids[j]]; |
|
20851
|
|
|
|
|
|
|
} |
|
20852
|
|
|
|
|
|
|
|
|
20853
|
0
|
0
|
|
|
|
|
key_buffer.resize(elementary_ids.size() * vli::max_length()); |
|
20854
|
0
|
|
|
|
|
|
char* key = key_buffer.data(); |
|
20855
|
0
|
0
|
|
|
|
|
for (unsigned j = 0; j < elementary_ids.size(); j++) |
|
20856
|
0
|
|
|
|
|
|
vli::encode(elementary_ids[j], key); |
|
20857
|
|
|
|
|
|
|
|
|
20858
|
0
|
|
|
|
|
|
updated_map.emplace(string(key_buffer.data(), key - key_buffer.data()), element.second); |
|
20859
|
|
|
|
|
|
|
} |
|
20860
|
|
|
|
|
|
|
|
|
20861
|
0
|
0
|
|
|
|
|
optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
|
|
|
0
|
|
|
|
|
|
|
20862
|
0
|
0
|
|
|
|
|
assert(feature_sequence_score(info.gamma) == info.gamma); |
|
20863
|
0
|
|
|
|
|
|
enc.add_4B(info.gamma); |
|
20864
|
0
|
|
|
|
|
|
})); |
|
20865
|
|
|
|
|
|
|
} |
|
20866
|
|
|
|
|
|
|
|
|
20867
|
|
|
|
|
|
|
// Original code which only dropped feature sequences with gamma == 0 |
|
20868
|
|
|
|
|
|
|
// optimized_elementary.maps.clear(); |
|
20869
|
|
|
|
|
|
|
// for (auto&& map : elementary.maps) |
|
20870
|
|
|
|
|
|
|
// optimized_elementary.maps.emplace_back(persistent_unordered_map(map.map, 1, [](binary_encoder& enc, elementary_feature_value value) { |
|
20871
|
|
|
|
|
|
|
// enc.add_4B(value); |
|
20872
|
|
|
|
|
|
|
// })); |
|
20873
|
|
|
|
|
|
|
// |
|
20874
|
|
|
|
|
|
|
// optimized_features.sequences = features.sequences; |
|
20875
|
|
|
|
|
|
|
// optimized_features.scores.clear(); |
|
20876
|
|
|
|
|
|
|
// for (auto&& score : features.scores) { |
|
20877
|
|
|
|
|
|
|
// decltype(score.map) pruned_map; |
|
20878
|
|
|
|
|
|
|
// for (auto&& element : score.map) |
|
20879
|
|
|
|
|
|
|
// if (element.second.gamma) |
|
20880
|
|
|
|
|
|
|
// pruned_map.insert(element); |
|
20881
|
|
|
|
|
|
|
// |
|
20882
|
|
|
|
|
|
|
// optimized_features.scores.emplace_back(persistent_unordered_map(pruned_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
|
20883
|
|
|
|
|
|
|
// enc.add_4B(info.gamma); |
|
20884
|
|
|
|
|
|
|
// })); |
|
20885
|
|
|
|
|
|
|
// } |
|
20886
|
0
|
|
|
|
|
|
} |
|
20887
|
|
|
|
|
|
|
|
|
20888
|
|
|
|
|
|
|
} // namespace morphodita |
|
20889
|
|
|
|
|
|
|
|
|
20890
|
|
|
|
|
|
|
///////// |
|
20891
|
|
|
|
|
|
|
// File: morphodita/tagger/tagger_trainer.h |
|
20892
|
|
|
|
|
|
|
///////// |
|
20893
|
|
|
|
|
|
|
|
|
20894
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
20895
|
|
|
|
|
|
|
// |
|
20896
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
20897
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
20898
|
|
|
|
|
|
|
// |
|
20899
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
20900
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
20901
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
20902
|
|
|
|
|
|
|
|
|
20903
|
|
|
|
|
|
|
namespace morphodita { |
|
20904
|
|
|
|
|
|
|
|
|
20905
|
|
|
|
|
|
|
// Declarations |
|
20906
|
|
|
|
|
|
|
template |
|
20907
|
|
|
|
|
|
|
class tagger_trainer { |
|
20908
|
|
|
|
|
|
|
public: |
|
20909
|
0
|
|
|
|
|
|
struct sentence { |
|
20910
|
|
|
|
|
|
|
vector words; |
|
20911
|
|
|
|
|
|
|
vector forms; |
|
20912
|
|
|
|
|
|
|
vector> analyses; |
|
20913
|
|
|
|
|
|
|
vector gold; |
|
20914
|
|
|
|
|
|
|
vector gold_index; |
|
20915
|
|
|
|
|
|
|
}; |
|
20916
|
|
|
|
|
|
|
|
|
20917
|
|
|
|
|
|
|
static void train(int decoding_order, int window_size, int iterations, istream& in_morpho_dict, bool use_guesser, istream& in_feature_templates, bool prune_features, istream& in_train, istream& in_heldout, bool early_stopping, ostream& out_tagger); |
|
20918
|
|
|
|
|
|
|
|
|
20919
|
|
|
|
|
|
|
private: |
|
20920
|
|
|
|
|
|
|
static double load_data(istream& is, const morpho& d, bool use_guesser, vector& sentences, bool add_gold); |
|
20921
|
|
|
|
|
|
|
}; |
|
20922
|
|
|
|
|
|
|
|
|
20923
|
|
|
|
|
|
|
// Definitions |
|
20924
|
|
|
|
|
|
|
template |
|
20925
|
0
|
|
|
|
|
|
void tagger_trainer::train(int decoding_order, int window_size, int iterations, istream& in_morpho_dict, bool use_guesser, istream& in_feature_templates, bool prune_features, istream& in_train, istream& in_heldout, bool early_stopping, ostream& out_tagger) { |
|
20926
|
|
|
|
|
|
|
// cerr << "Loading dictionary: "; |
|
20927
|
0
|
|
|
|
|
|
unique_ptr d(morpho::load(in_morpho_dict)); |
|
20928
|
0
|
0
|
|
|
|
|
if (!d) training_failure("Cannot load dictionary!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20929
|
|
|
|
|
|
|
// cerr << "done" << endl; |
|
20930
|
0
|
0
|
|
|
|
|
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20931
|
|
|
|
|
|
|
|
|
20932
|
0
|
|
|
|
|
|
vector train_data; |
|
20933
|
|
|
|
|
|
|
// cerr << "Loading train data: "; |
|
20934
|
|
|
|
|
|
|
// cerr << "done, matched " << fixed << setprecision(2) << 100 * load_data(in_train, *d, use_guesser, train_data, true) << '%' << endl; |
|
20935
|
0
|
0
|
|
|
|
|
load_data(in_train, *d, use_guesser, train_data, true); |
|
20936
|
|
|
|
|
|
|
|
|
20937
|
0
|
|
|
|
|
|
vector heldout_data; |
|
20938
|
0
|
0
|
|
|
|
|
if (in_heldout) { |
|
20939
|
|
|
|
|
|
|
// cerr << "Loading heldout data: "; |
|
20940
|
|
|
|
|
|
|
// cerr << "done, matched " << fixed << setprecision(2) << 100 * load_data(in_heldout, *d, use_guesser, heldout_data, false) << '%' << endl; |
|
20941
|
0
|
0
|
|
|
|
|
load_data(in_heldout, *d, use_guesser, heldout_data, false); |
|
20942
|
|
|
|
|
|
|
} |
|
20943
|
|
|
|
|
|
|
|
|
20944
|
|
|
|
|
|
|
// Encode morphological dictionary |
|
20945
|
|
|
|
|
|
|
// cerr << "Encoding morphological dictionary." << endl; |
|
20946
|
0
|
0
|
|
|
|
|
out_tagger << in_morpho_dict.rdbuf(); |
|
20947
|
0
|
0
|
|
|
|
|
out_tagger.put(use_guesser); |
|
20948
|
|
|
|
|
|
|
|
|
20949
|
|
|
|
|
|
|
// Train and encode the tagger |
|
20950
|
0
|
0
|
|
|
|
|
TaggerTrainer::train(decoding_order, window_size, iterations, train_data, heldout_data, early_stopping, prune_features, in_feature_templates, out_tagger); |
|
20951
|
0
|
|
|
|
|
|
} |
|
20952
|
|
|
|
|
|
|
|
|
20953
|
|
|
|
|
|
|
template |
|
20954
|
0
|
|
|
|
|
|
double tagger_trainer::load_data(istream& is, const morpho& d, bool use_guesser, vector& sentences, bool add_gold) { |
|
20955
|
|
|
|
|
|
|
sentences.clear(); |
|
20956
|
|
|
|
|
|
|
|
|
20957
|
|
|
|
|
|
|
int forms = 0, forms_matched = 0; |
|
20958
|
|
|
|
|
|
|
|
|
20959
|
|
|
|
|
|
|
string line; |
|
20960
|
0
|
|
|
|
|
|
vector tokens; |
|
20961
|
0
|
0
|
|
|
|
|
sentences.emplace_back(); |
|
20962
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
|
0
|
|
|
|
|
|
|
20963
|
0
|
0
|
|
|
|
|
if (line.empty()) { |
|
20964
|
0
|
0
|
|
|
|
|
if (!sentences.back().words.empty()) |
|
20965
|
0
|
0
|
|
|
|
|
sentences.emplace_back(); |
|
20966
|
|
|
|
|
|
|
continue; |
|
20967
|
|
|
|
|
|
|
} |
|
20968
|
|
|
|
|
|
|
|
|
20969
|
0
|
0
|
|
|
|
|
split(line, '\t', tokens); |
|
20970
|
0
|
0
|
|
|
|
|
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20971
|
|
|
|
|
|
|
|
|
20972
|
|
|
|
|
|
|
// Add form to sentence |
|
20973
|
0
|
|
|
|
|
|
forms++; |
|
20974
|
|
|
|
|
|
|
sentence& s = sentences.back(); |
|
20975
|
0
|
0
|
|
|
|
|
s.words.emplace_back(tokens[0]); |
|
20976
|
0
|
0
|
|
|
|
|
s.gold.emplace_back(tokens[1], tokens[2]); |
|
20977
|
0
|
0
|
|
|
|
|
s.gold_index.emplace_back(-1); |
|
20978
|
|
|
|
|
|
|
|
|
20979
|
|
|
|
|
|
|
// Analyse |
|
20980
|
0
|
0
|
|
|
|
|
s.analyses.emplace_back(); |
|
20981
|
0
|
0
|
|
|
|
|
d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back()); |
|
|
|
0
|
|
|
|
|
|
|
20982
|
|
|
|
|
|
|
|
|
20983
|
|
|
|
|
|
|
// Locate gold analysis |
|
20984
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < s.analyses.back().size(); i++) |
|
20985
|
0
|
0
|
|
|
|
|
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20986
|
0
|
|
|
|
|
|
s.gold_index.back() = i; |
|
20987
|
0
|
|
|
|
|
|
forms_matched++; |
|
20988
|
0
|
|
|
|
|
|
break; |
|
20989
|
|
|
|
|
|
|
} |
|
20990
|
0
|
0
|
|
|
|
|
if (s.gold_index.back() == -1 && add_gold) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20991
|
0
|
|
|
|
|
|
s.gold_index.back() = s.analyses.back().size(); |
|
20992
|
0
|
0
|
|
|
|
|
s.analyses.back().emplace_back(tokens[1], tokens[2]); |
|
20993
|
|
|
|
|
|
|
} |
|
20994
|
|
|
|
|
|
|
} |
|
20995
|
0
|
0
|
|
|
|
|
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
20996
|
|
|
|
|
|
|
|
|
20997
|
|
|
|
|
|
|
// Fill the forms string_pieces now that the sentences will not reallocate |
|
20998
|
0
|
0
|
|
|
|
|
for (auto&& sentence : sentences) |
|
20999
|
0
|
0
|
|
|
|
|
for (auto&& word : sentence.words) |
|
21000
|
0
|
0
|
|
|
|
|
sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word))); |
|
|
|
0
|
|
|
|
|
|
|
21001
|
|
|
|
|
|
|
|
|
21002
|
0
|
|
|
|
|
|
return forms_matched / double(forms); |
|
21003
|
|
|
|
|
|
|
} |
|
21004
|
|
|
|
|
|
|
|
|
21005
|
|
|
|
|
|
|
} // namespace morphodita |
|
21006
|
|
|
|
|
|
|
|
|
21007
|
|
|
|
|
|
|
///////// |
|
21008
|
|
|
|
|
|
|
// File: morphodita/tagger/perceptron_tagger_trainer.h |
|
21009
|
|
|
|
|
|
|
///////// |
|
21010
|
|
|
|
|
|
|
|
|
21011
|
|
|
|
|
|
|
// This file is part of MorphoDiTa . |
|
21012
|
|
|
|
|
|
|
// |
|
21013
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
21014
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
21015
|
|
|
|
|
|
|
// |
|
21016
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
21017
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
21018
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
21019
|
|
|
|
|
|
|
|
|
21020
|
|
|
|
|
|
|
namespace morphodita { |
|
21021
|
|
|
|
|
|
|
|
|
21022
|
|
|
|
|
|
|
// Declarations |
|
21023
|
|
|
|
|
|
|
template |
|
21024
|
|
|
|
|
|
|
class perceptron_tagger_trainer { |
|
21025
|
|
|
|
|
|
|
public: |
|
21026
|
|
|
|
|
|
|
typedef typename tagger_trainer>::sentence sentence; |
|
21027
|
|
|
|
|
|
|
|
|
21028
|
|
|
|
|
|
|
static void train(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, istream& in_feature_templates, ostream& out_tagger); |
|
21029
|
|
|
|
|
|
|
|
|
21030
|
|
|
|
|
|
|
private: |
|
21031
|
|
|
|
|
|
|
static void train_viterbi(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, FeatureSequences& features); |
|
21032
|
|
|
|
|
|
|
}; |
|
21033
|
|
|
|
|
|
|
|
|
21034
|
|
|
|
|
|
|
// Definitions |
|
21035
|
|
|
|
|
|
|
template |
|
21036
|
0
|
|
|
|
|
|
void perceptron_tagger_trainer::train(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, istream& in_feature_templates, ostream& out_tagger) { |
|
21037
|
0
|
|
|
|
|
|
FeatureSequences features; |
|
21038
|
|
|
|
|
|
|
|
|
21039
|
|
|
|
|
|
|
// cerr << "Parsing feature templates..." << endl; |
|
21040
|
0
|
0
|
|
|
|
|
features.parse(window_size, in_feature_templates); |
|
21041
|
|
|
|
|
|
|
|
|
21042
|
|
|
|
|
|
|
// cerr << "Training tagger..." << endl; |
|
21043
|
0
|
0
|
|
|
|
|
train_viterbi(decoding_order, window_size, iterations, train, heldout, early_stopping, prune_features, features); |
|
21044
|
|
|
|
|
|
|
|
|
21045
|
|
|
|
|
|
|
// cerr << "Encoding tagger..." << endl; |
|
21046
|
|
|
|
|
|
|
typedef feature_sequences_optimizer optimizer; |
|
21047
|
0
|
|
|
|
|
|
typename optimizer::optimized_feature_sequences optimized_features; |
|
21048
|
0
|
0
|
|
|
|
|
optimizer::optimize(features, optimized_features); |
|
21049
|
0
|
0
|
|
|
|
|
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21050
|
0
|
|
|
|
|
|
} |
|
21051
|
|
|
|
|
|
|
|
|
21052
|
|
|
|
|
|
|
template |
|
21053
|
0
|
|
|
|
|
|
void perceptron_tagger_trainer::train_viterbi(int decoding_order, int window_size, int iterations, const vector& train, const vector& heldout, bool early_stopping, bool prune_features, FeatureSequences& features) { |
|
21054
|
|
|
|
|
|
|
int best_correct = 0, best_iteration = -1; |
|
21055
|
0
|
|
|
|
|
|
FeatureSequences best_features; |
|
21056
|
|
|
|
|
|
|
|
|
21057
|
|
|
|
|
|
|
viterbi decoder(features, decoding_order, window_size); |
|
21058
|
0
|
0
|
|
|
|
|
typename decltype(decoder)::cache decoder_cache(decoder); |
|
21059
|
|
|
|
|
|
|
|
|
21060
|
0
|
0
|
|
|
|
|
typename FeatureSequences::cache feature_sequences_cache(features); |
|
21061
|
|
|
|
|
|
|
typename FeatureSequences::dynamic_features decoded_dynamic_features, gold_dynamic_features; |
|
21062
|
0
|
|
|
|
|
|
vector decoded_feature_sequences_keys, gold_feature_sequences_keys; |
|
21063
|
|
|
|
|
|
|
|
|
21064
|
0
|
0
|
|
|
|
|
vector window(window_size); |
|
21065
|
|
|
|
|
|
|
|
|
21066
|
|
|
|
|
|
|
// Initialize feature sequences for the gold decoding only if requested |
|
21067
|
0
|
0
|
|
|
|
|
if (prune_features) |
|
21068
|
0
|
0
|
|
|
|
|
for (unsigned s = 0; s < train.size(); s++) { |
|
21069
|
|
|
|
|
|
|
auto& sentence = train[s]; |
|
21070
|
0
|
0
|
|
|
|
|
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
|
21071
|
0
|
0
|
|
|
|
|
for (int i = 0; i < int(sentence.forms.size()); i++) { |
|
21072
|
0
|
|
|
|
|
|
window.assign(window_size, -1); |
|
21073
|
0
|
0
|
|
|
|
|
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
|
|
0
|
|
|
|
|
|
|
21074
|
|
|
|
|
|
|
|
|
21075
|
0
|
|
|
|
|
|
features.compute_dynamic_features(i, window[0], &gold_dynamic_features, gold_dynamic_features, feature_sequences_cache); |
|
21076
|
0
|
0
|
|
|
|
|
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
|
21077
|
|
|
|
|
|
|
|
|
21078
|
0
|
0
|
|
|
|
|
for (unsigned f = 0; f < features.scores.size(); f++) |
|
21079
|
0
|
0
|
|
|
|
|
if (!gold_feature_sequences_keys[f].empty()) |
|
21080
|
|
|
|
|
|
|
features.scores[f].map[gold_feature_sequences_keys[f]]; |
|
21081
|
|
|
|
|
|
|
} |
|
21082
|
|
|
|
|
|
|
} |
|
21083
|
|
|
|
|
|
|
|
|
21084
|
|
|
|
|
|
|
// Train for given number of iterations |
|
21085
|
0
|
0
|
|
|
|
|
for (int i = 0; i < iterations; i++) { |
|
21086
|
|
|
|
|
|
|
// Train |
|
21087
|
|
|
|
|
|
|
int train_correct = 0, train_total = 0; |
|
21088
|
0
|
0
|
|
|
|
|
cerr << "Iteration " << i + 1 << ": "; |
|
|
|
0
|
|
|
|
|
|
|
21089
|
|
|
|
|
|
|
|
|
21090
|
|
|
|
|
|
|
vector tags; |
|
21091
|
0
|
0
|
|
|
|
|
for (unsigned s = 0; s < train.size(); s++) { |
|
21092
|
|
|
|
|
|
|
auto& sentence = train[s]; |
|
21093
|
|
|
|
|
|
|
|
|
21094
|
|
|
|
|
|
|
// Run Viterbi |
|
21095
|
0
|
0
|
|
|
|
|
if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size()); |
|
|
|
0
|
|
|
|
|
|
|
21096
|
0
|
0
|
|
|
|
|
decoder.tag(sentence.forms, sentence.analyses, decoder_cache, tags); |
|
21097
|
|
|
|
|
|
|
|
|
21098
|
|
|
|
|
|
|
// Compute feature sequence keys or decoded result and gold result and update alpha & gamma |
|
21099
|
0
|
0
|
|
|
|
|
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
|
21100
|
0
|
0
|
|
|
|
|
for (int i = 0; i < int(sentence.forms.size()); i++) { |
|
21101
|
0
|
|
|
|
|
|
train_correct += tags[i] == sentence.gold_index[i]; |
|
21102
|
0
|
|
|
|
|
|
train_total++; |
|
21103
|
|
|
|
|
|
|
|
|
21104
|
0
|
|
|
|
|
|
window.assign(window_size, -1); |
|
21105
|
0
|
0
|
|
|
|
|
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j]; |
|
|
|
0
|
|
|
|
|
|
|
21106
|
0
|
|
|
|
|
|
features.compute_dynamic_features(i, window[0], &decoded_dynamic_features, decoded_dynamic_features, feature_sequences_cache); |
|
21107
|
0
|
0
|
|
|
|
|
features.feature_keys(i, window.data(), 0, decoded_dynamic_features, decoded_feature_sequences_keys, feature_sequences_cache); |
|
21108
|
|
|
|
|
|
|
|
|
21109
|
0
|
0
|
|
|
|
|
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
|
|
0
|
|
|
|
|
|
|
21110
|
0
|
|
|
|
|
|
features.compute_dynamic_features(i, window[0], &gold_dynamic_features, gold_dynamic_features, feature_sequences_cache); |
|
21111
|
0
|
0
|
|
|
|
|
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
|
21112
|
|
|
|
|
|
|
|
|
21113
|
0
|
0
|
|
|
|
|
for (unsigned f = 0; f < features.scores.size(); f++) { |
|
21114
|
0
|
0
|
|
|
|
|
if (decoded_feature_sequences_keys[f] != gold_feature_sequences_keys[f]) { |
|
21115
|
0
|
0
|
|
|
|
|
if (!decoded_feature_sequences_keys[f].empty()) { |
|
21116
|
|
|
|
|
|
|
auto it = features.scores[f].map.find(decoded_feature_sequences_keys[f]); |
|
21117
|
0
|
0
|
|
|
|
|
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21118
|
0
|
0
|
|
|
|
|
if (it != features.scores[f].map.end()) { |
|
21119
|
|
|
|
|
|
|
auto& decoded_info = it->second; |
|
21120
|
0
|
|
|
|
|
|
decoded_info.gamma += decoded_info.alpha * (s - decoded_info.last_gamma_update); |
|
21121
|
0
|
|
|
|
|
|
decoded_info.last_gamma_update = s; |
|
21122
|
0
|
|
|
|
|
|
decoded_info.alpha--; |
|
21123
|
|
|
|
|
|
|
} |
|
21124
|
|
|
|
|
|
|
} |
|
21125
|
|
|
|
|
|
|
|
|
21126
|
0
|
0
|
|
|
|
|
if (!gold_feature_sequences_keys[f].empty()) { |
|
21127
|
|
|
|
|
|
|
auto it = features.scores[f].map.find(gold_feature_sequences_keys[f]); |
|
21128
|
0
|
0
|
|
|
|
|
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21129
|
0
|
0
|
|
|
|
|
if (it != features.scores[f].map.end()) { |
|
21130
|
|
|
|
|
|
|
auto& gold_info = it->second; |
|
21131
|
0
|
|
|
|
|
|
gold_info.gamma += gold_info.alpha * (s - gold_info.last_gamma_update); |
|
21132
|
0
|
|
|
|
|
|
gold_info.last_gamma_update = s; |
|
21133
|
0
|
|
|
|
|
|
gold_info.alpha++; |
|
21134
|
|
|
|
|
|
|
} |
|
21135
|
|
|
|
|
|
|
} |
|
21136
|
|
|
|
|
|
|
} |
|
21137
|
|
|
|
|
|
|
} |
|
21138
|
|
|
|
|
|
|
} |
|
21139
|
|
|
|
|
|
|
} |
|
21140
|
|
|
|
|
|
|
|
|
21141
|
|
|
|
|
|
|
// Finalize incremental gamma updates |
|
21142
|
0
|
0
|
|
|
|
|
for (auto&& score : features.scores) |
|
21143
|
0
|
0
|
|
|
|
|
for (auto&& element : score.map) { |
|
21144
|
0
|
|
|
|
|
|
element.second.gamma += element.second.alpha * (train.size() - element.second.last_gamma_update); |
|
21145
|
0
|
|
|
|
|
|
element.second.last_gamma_update = 0; |
|
21146
|
|
|
|
|
|
|
} |
|
21147
|
0
|
|
|
|
|
|
cerr << "done, accuracy " << fixed << setprecision(2) << train_correct * 100 / double(train_total) << '%'; |
|
21148
|
|
|
|
|
|
|
|
|
21149
|
|
|
|
|
|
|
// If we have any heldout data, compute accuracy and if requested store best tagger configuration |
|
21150
|
0
|
0
|
|
|
|
|
if (!heldout.empty()) { |
|
21151
|
|
|
|
|
|
|
enum { TAGS, LEMMAS, BOTH, TOTAL }; |
|
21152
|
|
|
|
|
|
|
int heldout_correct[TOTAL] = {}, heldout_total = 0; |
|
21153
|
|
|
|
|
|
|
|
|
21154
|
|
|
|
|
|
|
typedef feature_sequences_optimizer optimizer; |
|
21155
|
0
|
|
|
|
|
|
typename optimizer::optimized_feature_sequences frozen_features; |
|
21156
|
0
|
0
|
|
|
|
|
optimizer::optimize(features, frozen_features); |
|
21157
|
|
|
|
|
|
|
viterbi frozen_decoder(frozen_features, decoding_order, window_size); |
|
21158
|
0
|
0
|
|
|
|
|
typename decltype(frozen_decoder)::cache frozen_decoder_cache(frozen_decoder); |
|
21159
|
|
|
|
|
|
|
|
|
21160
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
|
21161
|
0
|
0
|
|
|
|
|
if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2); |
|
|
|
0
|
|
|
|
|
|
|
21162
|
0
|
0
|
|
|
|
|
frozen_decoder.tag(sentence.forms, sentence.analyses, frozen_decoder_cache, tags); |
|
21163
|
|
|
|
|
|
|
|
|
21164
|
0
|
0
|
|
|
|
|
for (unsigned i = 0; i < sentence.forms.size(); i++) { |
|
21165
|
0
|
|
|
|
|
|
heldout_correct[TAGS] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag; |
|
21166
|
0
|
|
|
|
|
|
heldout_correct[LEMMAS] += sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
|
21167
|
0
|
0
|
|
|
|
|
heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
|
|
|
0
|
|
|
|
|
|
|
21168
|
0
|
|
|
|
|
|
heldout_total++; |
|
21169
|
|
|
|
|
|
|
} |
|
21170
|
|
|
|
|
|
|
} |
|
21171
|
|
|
|
|
|
|
|
|
21172
|
0
|
0
|
|
|
|
|
if (early_stopping && heldout_correct[BOTH] > best_correct) { |
|
|
|
0
|
|
|
|
|
|
|
21173
|
|
|
|
|
|
|
best_correct = heldout_correct[BOTH]; |
|
21174
|
|
|
|
|
|
|
best_iteration = i; |
|
21175
|
0
|
0
|
|
|
|
|
best_features = features; |
|
21176
|
|
|
|
|
|
|
} |
|
21177
|
|
|
|
|
|
|
|
|
21178
|
0
|
0
|
|
|
|
|
cerr << ", heldout accuracy " << fixed << setprecision(2) |
|
21179
|
0
|
|
|
|
|
|
<< 100 * heldout_correct[TAGS] / double(heldout_total) << "%t/" |
|
21180
|
0
|
|
|
|
|
|
<< 100 * heldout_correct[LEMMAS] / double(heldout_total) << "%l/" |
|
21181
|
0
|
|
|
|
|
|
<< 100 * heldout_correct[BOTH] / double(heldout_total) << "%b"; |
|
21182
|
|
|
|
|
|
|
} |
|
21183
|
|
|
|
|
|
|
cerr << endl; |
|
21184
|
|
|
|
|
|
|
} |
|
21185
|
|
|
|
|
|
|
|
|
21186
|
0
|
0
|
|
|
|
|
if (early_stopping && best_iteration >= 0) { |
|
21187
|
0
|
0
|
|
|
|
|
cerr << "Chosen tagger model from iteration " << best_iteration + 1 << endl; |
|
21188
|
0
|
0
|
|
|
|
|
features = best_features; |
|
21189
|
|
|
|
|
|
|
} |
|
21190
|
0
|
|
|
|
|
|
} |
|
21191
|
|
|
|
|
|
|
|
|
21192
|
|
|
|
|
|
|
} // namespace morphodita |
|
21193
|
|
|
|
|
|
|
|
|
21194
|
|
|
|
|
|
|
///////// |
|
21195
|
|
|
|
|
|
|
// File: utils/options.h |
|
21196
|
|
|
|
|
|
|
///////// |
|
21197
|
|
|
|
|
|
|
|
|
21198
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
21199
|
|
|
|
|
|
|
// |
|
21200
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
21201
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
21202
|
|
|
|
|
|
|
// |
|
21203
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
21204
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
21205
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
21206
|
|
|
|
|
|
|
|
|
21207
|
|
|
|
|
|
|
namespace utils { |
|
21208
|
|
|
|
|
|
|
|
|
21209
|
|
|
|
|
|
|
class options { |
|
21210
|
|
|
|
|
|
|
public: |
|
21211
|
|
|
|
|
|
|
typedef unordered_map map; |
|
21212
|
|
|
|
|
|
|
|
|
21213
|
|
|
|
|
|
|
struct value { |
|
21214
|
|
|
|
|
|
|
enum allowed_t { NONE, ANY, SET }; |
|
21215
|
|
|
|
|
|
|
allowed_t allowed; |
|
21216
|
|
|
|
|
|
|
unordered_set set; |
|
21217
|
|
|
|
|
|
|
|
|
21218
|
|
|
|
|
|
|
value(initializer_list set) : allowed(SET), set(set) {} |
|
21219
|
|
|
|
|
|
|
static const value none; |
|
21220
|
|
|
|
|
|
|
static const value any; |
|
21221
|
|
|
|
|
|
|
|
|
21222
|
|
|
|
|
|
|
private: |
|
21223
|
|
|
|
|
|
|
value(allowed_t allowed) : allowed(allowed) {} |
|
21224
|
|
|
|
|
|
|
}; |
|
21225
|
|
|
|
|
|
|
|
|
21226
|
|
|
|
|
|
|
// Parse options according to allowed map. If successful, argv is reordered so |
|
21227
|
|
|
|
|
|
|
// that non-option arguments are placed in argv[1] to argv[argc-1]. The '--' |
|
21228
|
|
|
|
|
|
|
// indicates end of option arguments (as usual). The allowed map contains |
|
21229
|
|
|
|
|
|
|
// values allowed for every option. If empty, no value is allowed, if it |
|
21230
|
|
|
|
|
|
|
// contains just an empty string, any value is allowed. |
|
21231
|
|
|
|
|
|
|
static bool parse(const unordered_map& allowed, int& argc, char**& argv, map& options); |
|
21232
|
|
|
|
|
|
|
}; |
|
21233
|
|
|
|
|
|
|
|
|
21234
|
|
|
|
|
|
|
} // namespace utils |
|
21235
|
|
|
|
|
|
|
|
|
21236
|
|
|
|
|
|
|
///////// |
|
21237
|
|
|
|
|
|
|
// File: version/version.h |
|
21238
|
|
|
|
|
|
|
///////// |
|
21239
|
|
|
|
|
|
|
|
|
21240
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
21241
|
|
|
|
|
|
|
// |
|
21242
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
21243
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
21244
|
|
|
|
|
|
|
// |
|
21245
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
21246
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
21247
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
21248
|
|
|
|
|
|
|
|
|
21249
|
0
|
|
|
|
|
|
class version { |
|
21250
|
|
|
|
|
|
|
public: |
|
21251
|
|
|
|
|
|
|
unsigned major; |
|
21252
|
|
|
|
|
|
|
unsigned minor; |
|
21253
|
|
|
|
|
|
|
unsigned patch; |
|
21254
|
|
|
|
|
|
|
std::string prerelease; |
|
21255
|
|
|
|
|
|
|
|
|
21256
|
|
|
|
|
|
|
// Returns current version. |
|
21257
|
|
|
|
|
|
|
static version current(); |
|
21258
|
|
|
|
|
|
|
|
|
21259
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
|
21260
|
|
|
|
|
|
|
static string version_and_copyright(const string& other_libraries = string()); |
|
21261
|
|
|
|
|
|
|
}; |
|
21262
|
|
|
|
|
|
|
|
|
21263
|
|
|
|
|
|
|
///////// |
|
21264
|
|
|
|
|
|
|
// File: trainer/trainer_morphodita_parsito.cpp |
|
21265
|
|
|
|
|
|
|
///////// |
|
21266
|
|
|
|
|
|
|
|
|
21267
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
21268
|
|
|
|
|
|
|
// |
|
21269
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
21270
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
21271
|
|
|
|
|
|
|
// |
|
21272
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
21273
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
21274
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
21275
|
|
|
|
|
|
|
|
|
21276
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train(const vector& training, const vector& heldout, |
|
21277
|
|
|
|
|
|
|
const string& tokenizer, const string& tagger, const string& parser, ostream& os, string& error) { |
|
21278
|
|
|
|
|
|
|
error.clear(); |
|
21279
|
|
|
|
|
|
|
|
|
21280
|
|
|
|
|
|
|
// Save model version info |
|
21281
|
0
|
|
|
|
|
|
os.put(model_morphodita_parsito::VERSION_LATEST); |
|
21282
|
|
|
|
|
|
|
// Add sentinel required since version 2 |
|
21283
|
0
|
|
|
|
|
|
os.put(0x7F).put(0x7F); |
|
21284
|
|
|
|
|
|
|
|
|
21285
|
|
|
|
|
|
|
// Check input data |
|
21286
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
|
21287
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
|
21288
|
0
|
0
|
|
|
|
|
if (!can_combine_tag(sentence.words[i], error)) |
|
21289
|
|
|
|
|
|
|
return false; |
|
21290
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) |
|
21291
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
|
21292
|
0
|
0
|
|
|
|
|
if (!can_combine_tag(sentence.words[i], error)) |
|
21293
|
|
|
|
|
|
|
return false; |
|
21294
|
|
|
|
|
|
|
|
|
21295
|
0
|
0
|
|
|
|
|
if (!train_tokenizer(training, heldout, tokenizer, os, error)) return false; |
|
21296
|
|
|
|
|
|
|
string tagger_model; |
|
21297
|
|
|
|
|
|
|
{ |
|
21298
|
0
|
0
|
|
|
|
|
ostringstream os_tagger; |
|
21299
|
0
|
0
|
|
|
|
|
if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21300
|
0
|
|
|
|
|
|
tagger_model.assign(os_tagger.str()); |
|
21301
|
0
|
0
|
|
|
|
|
os.write(tagger_model.data(), tagger_model.size()); |
|
21302
|
|
|
|
|
|
|
} |
|
21303
|
0
|
0
|
|
|
|
|
if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21304
|
|
|
|
|
|
|
|
|
21305
|
0
|
|
|
|
|
|
return true; |
|
21306
|
|
|
|
|
|
|
} |
|
21307
|
|
|
|
|
|
|
|
|
21308
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_tokenizer(const vector& training, const vector& heldout, |
|
21309
|
|
|
|
|
|
|
const string& options, ostream& os, string& error) { |
|
21310
|
0
|
0
|
|
|
|
|
if (options == NONE) { |
|
21311
|
0
|
|
|
|
|
|
os.put(0); |
|
21312
|
|
|
|
|
|
|
} else { |
|
21313
|
|
|
|
|
|
|
// Tokenizer options |
|
21314
|
|
|
|
|
|
|
named_values::map tokenizer; |
|
21315
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, tokenizer, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21316
|
0
|
0
|
|
|
|
|
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21317
|
|
|
|
|
|
|
|
|
21318
|
0
|
0
|
|
|
|
|
if (tokenizer.count("from_model")) { |
|
|
|
0
|
|
|
|
|
|
|
21319
|
|
|
|
|
|
|
// Use specified tokenizer model |
|
21320
|
|
|
|
|
|
|
string_piece tokenizer_data; |
|
21321
|
0
|
0
|
|
|
|
|
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21322
|
0
|
0
|
|
|
|
|
return error.assign("Cannot load model from which the tokenizer should be used!"), false; |
|
21323
|
|
|
|
|
|
|
|
|
21324
|
|
|
|
|
|
|
cerr << "Using tokenizer from given model." << endl; |
|
21325
|
0
|
0
|
|
|
|
|
os.write(tokenizer_data.str, tokenizer_data.len); |
|
21326
|
|
|
|
|
|
|
} else { |
|
21327
|
0
|
0
|
|
|
|
|
os.put(1); |
|
21328
|
0
|
0
|
|
|
|
|
const string& model = option_str(tokenizer, "model"); |
|
|
|
0
|
|
|
|
|
|
|
21329
|
|
|
|
|
|
|
|
|
21330
|
|
|
|
|
|
|
// Tokenizer itself |
|
21331
|
0
|
0
|
|
|
|
|
if (model == "generic") { |
|
21332
|
0
|
0
|
|
|
|
|
os.put(morphodita::tokenizer_id::GENERIC); |
|
21333
|
|
|
|
|
|
|
morphodita::generic_tokenizer_factory_encoder::encode(morphodita::generic_tokenizer::LATEST, os); |
|
21334
|
0
|
0
|
|
|
|
|
} else if (model.empty() || model == "gru") { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21335
|
|
|
|
|
|
|
// Create a detokenizator if required |
|
21336
|
0
|
|
|
|
|
|
unique_ptr detokenizer; |
|
21337
|
0
|
0
|
|
|
|
|
if (tokenizer.count("detokenize")) { |
|
|
|
0
|
|
|
|
|
|
|
21338
|
0
|
0
|
|
|
|
|
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21339
|
0
|
0
|
|
|
|
|
if (!detokenizer) return error.assign("Cannot create detokenizer!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21340
|
|
|
|
|
|
|
} |
|
21341
|
|
|
|
|
|
|
|
|
21342
|
|
|
|
|
|
|
// Prepare training data for the gru_tokenizer |
|
21343
|
0
|
|
|
|
|
|
vector sentences; |
|
21344
|
|
|
|
|
|
|
bool spaces_in_training = false; |
|
21345
|
0
|
0
|
|
|
|
|
for (size_t training_sentence = 0; training_sentence < training.size(); training_sentence++) { |
|
21346
|
0
|
0
|
|
|
|
|
sentence s = training[training_sentence]; |
|
21347
|
0
|
0
|
|
|
|
|
if (detokenizer) detokenizer->detokenize(s); |
|
|
|
0
|
|
|
|
|
|
|
21348
|
|
|
|
|
|
|
|
|
21349
|
0
|
0
|
|
|
|
|
auto& sentence = (sentences.emplace_back(), sentences.back()); |
|
21350
|
|
|
|
|
|
|
|
|
21351
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
21352
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
|
21353
|
0
|
0
|
|
|
|
|
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
21354
|
|
|
|
|
|
|
|
|
21355
|
0
|
0
|
|
|
|
|
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
|
21356
|
0
|
0
|
|
|
|
|
for (auto&& chr : unilib::utf8::decoder(tok.form)) { |
|
21357
|
0
|
0
|
|
|
|
|
sentence.sentence.push_back(chr); |
|
21358
|
0
|
0
|
|
|
|
|
if (unilib::unicode::category(chr) & unilib::unicode::Zs) spaces_in_training = true; |
|
21359
|
|
|
|
|
|
|
} |
|
21360
|
0
|
|
|
|
|
|
sentence.tokens.back().length = sentence.sentence.size() - sentence.tokens.back().start; |
|
21361
|
|
|
|
|
|
|
|
|
21362
|
0
|
0
|
|
|
|
|
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21363
|
|
|
|
|
|
|
|
|
21364
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21365
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
|
21366
|
|
|
|
|
|
|
} |
|
21367
|
0
|
0
|
|
|
|
|
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21368
|
0
|
|
|
|
|
|
sentence.sentence.append(2, '\n'); |
|
21369
|
|
|
|
|
|
|
} |
|
21370
|
|
|
|
|
|
|
|
|
21371
|
|
|
|
|
|
|
// Heldout data |
|
21372
|
0
|
|
|
|
|
|
vector heldout_sentences; |
|
21373
|
|
|
|
|
|
|
|
|
21374
|
0
|
0
|
|
|
|
|
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21375
|
0
|
0
|
|
|
|
|
for (size_t heldout_sentence = 0; heldout_sentence < heldout.size(); heldout_sentence++) { |
|
21376
|
0
|
0
|
|
|
|
|
sentence s = heldout[heldout_sentence]; |
|
21377
|
0
|
0
|
|
|
|
|
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21378
|
|
|
|
|
|
|
|
|
21379
|
0
|
0
|
|
|
|
|
auto& sentence = (heldout_sentences.emplace_back(), heldout_sentences.back()); |
|
21380
|
|
|
|
|
|
|
|
|
21381
|
0
|
0
|
|
|
|
|
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
21382
|
0
|
0
|
|
|
|
|
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
|
21383
|
0
|
0
|
|
|
|
|
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
21384
|
|
|
|
|
|
|
|
|
21385
|
0
|
0
|
|
|
|
|
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
|
21386
|
0
|
0
|
|
|
|
|
for (auto&& chr : unilib::utf8::decoder(tok.form)) |
|
21387
|
0
|
0
|
|
|
|
|
sentence.sentence.push_back(chr); |
|
21388
|
0
|
|
|
|
|
|
sentence.tokens.back().length = sentence.sentence.size() - sentence.tokens.back().start; |
|
21389
|
|
|
|
|
|
|
|
|
21390
|
0
|
0
|
|
|
|
|
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21391
|
|
|
|
|
|
|
|
|
21392
|
0
|
0
|
|
|
|
|
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21393
|
0
|
|
|
|
|
|
i = s.multiword_tokens[j++].id_last; |
|
21394
|
|
|
|
|
|
|
} |
|
21395
|
0
|
0
|
|
|
|
|
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21396
|
0
|
|
|
|
|
|
sentence.sentence.append(2, '\n'); |
|
21397
|
|
|
|
|
|
|
} |
|
21398
|
|
|
|
|
|
|
|
|
21399
|
|
|
|
|
|
|
// Options |
|
21400
|
0
|
0
|
|
|
|
|
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21401
|
0
|
0
|
|
|
|
|
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21402
|
0
|
0
|
|
|
|
|
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21403
|
0
|
0
|
|
|
|
|
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21404
|
0
|
0
|
|
|
|
|
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21405
|
0
|
0
|
|
|
|
|
int batch_size = run <= 1 ? 50 : 50 + 50 * hyperparameter_integer(run, 1, 0, 1); |
|
21406
|
0
|
0
|
|
|
|
|
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21407
|
0
|
0
|
|
|
|
|
double learning_rate = run <= 1 ? 0.005 : hyperparameter_logarithmic(run, 2, 0.0005, 0.01); |
|
21408
|
0
|
0
|
|
|
|
|
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21409
|
0
|
0
|
|
|
|
|
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21410
|
0
|
0
|
|
|
|
|
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21411
|
0
|
0
|
|
|
|
|
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21412
|
0
|
0
|
|
|
|
|
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21413
|
|
|
|
|
|
|
|
|
21414
|
0
|
0
|
|
|
|
|
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21415
|
0
|
|
|
|
|
|
<< ", learning_rate=" << fixed << setprecision(8) << learning_rate << endl; |
|
21416
|
|
|
|
|
|
|
|
|
21417
|
0
|
0
|
|
|
|
|
cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0) |
|
|
|
0
|
|
|
|
|
|
|
21418
|
0
|
0
|
|
|
|
|
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21419
|
0
|
0
|
|
|
|
|
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21420
|
0
|
|
|
|
|
|
<< ", learning_rate=" << fixed << setprecision(4) << learning_rate << ", learning_rate_final=" << learning_rate_final << endl |
|
21421
|
0
|
0
|
|
|
|
|
<< " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
|
|
0
|
|
|
|
|
|
|
21422
|
|
|
|
|
|
|
|
|
21423
|
|
|
|
|
|
|
// Train and encode gru_tokenizer |
|
21424
|
0
|
0
|
|
|
|
|
os.put(morphodita::tokenizer_ids::GRU); |
|
21425
|
0
|
0
|
|
|
|
|
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21426
|
|
|
|
|
|
|
segment_size, allow_spaces, dimension, epochs, batch_size, learning_rate, |
|
21427
|
|
|
|
|
|
|
learning_rate_final, dropout, initialization_range, early_stopping, |
|
21428
|
|
|
|
|
|
|
sentences, heldout_sentences, os, error)) |
|
21429
|
|
|
|
|
|
|
return false; |
|
21430
|
|
|
|
|
|
|
} else { |
|
21431
|
0
|
0
|
|
|
|
|
return error.assign("Unknown tokenizer model '").append(model).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21432
|
|
|
|
|
|
|
} |
|
21433
|
|
|
|
|
|
|
|
|
21434
|
|
|
|
|
|
|
// Multiword splitter |
|
21435
|
0
|
0
|
|
|
|
|
if (!multiword_splitter_trainer::train(training, os, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21436
|
|
|
|
|
|
|
} |
|
21437
|
|
|
|
|
|
|
} |
|
21438
|
|
|
|
|
|
|
|
|
21439
|
|
|
|
|
|
|
return true; |
|
21440
|
|
|
|
|
|
|
} |
|
21441
|
|
|
|
|
|
|
|
|
21442
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_tagger(const vector& training, const vector& heldout, |
|
21443
|
|
|
|
|
|
|
const string& options, ostream& os, string& error) { |
|
21444
|
0
|
0
|
|
|
|
|
if (options == NONE) { |
|
21445
|
0
|
|
|
|
|
|
os.put(0); |
|
21446
|
|
|
|
|
|
|
} else { |
|
21447
|
|
|
|
|
|
|
// Parse options |
|
21448
|
|
|
|
|
|
|
named_values::map tagger; |
|
21449
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, tagger, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21450
|
|
|
|
|
|
|
|
|
21451
|
0
|
0
|
|
|
|
|
if (tagger.count("from_model")) { |
|
|
|
0
|
|
|
|
|
|
|
21452
|
|
|
|
|
|
|
// Use specified tokenizer model(s) |
|
21453
|
|
|
|
|
|
|
int model_index = 0, taggers_total = 0; |
|
21454
|
0
|
0
|
|
|
|
|
string model_name = "from_model"; |
|
21455
|
|
|
|
|
|
|
vector taggers_data; |
|
21456
|
0
|
0
|
|
|
|
|
do { |
|
21457
|
0
|
0
|
|
|
|
|
taggers_data.emplace_back(); |
|
21458
|
0
|
0
|
|
|
|
|
if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back())) |
|
|
|
0
|
|
|
|
|
|
|
21459
|
0
|
0
|
|
|
|
|
return error.assign("Cannot load model from which the tagger should be used!"), false; |
|
21460
|
0
|
0
|
|
|
|
|
if (taggers_data.back().str[0]) { |
|
21461
|
0
|
|
|
|
|
|
taggers_total += taggers_data.back().str[0]; |
|
21462
|
|
|
|
|
|
|
|
|
21463
|
0
|
0
|
|
|
|
|
vector overrides = {"lemma", "xpostag", "feats"}; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21464
|
0
|
0
|
|
|
|
|
for (size_t i = 0; i < overrides.size(); i++) { |
|
21465
|
0
|
0
|
|
|
|
|
string override_name = "from_model_" + overrides[i]; |
|
21466
|
0
|
|
|
|
|
|
int override_value = -1; |
|
21467
|
0
|
0
|
|
|
|
|
if (!option_int(tagger, override_name, override_value, error, model_index)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21468
|
0
|
0
|
|
|
|
|
if (override_value >= 0) |
|
21469
|
0
|
|
|
|
|
|
const_cast(taggers_data.back().str[1 + i]) = override_value; |
|
21470
|
|
|
|
|
|
|
} |
|
21471
|
|
|
|
|
|
|
} else { |
|
21472
|
|
|
|
|
|
|
taggers_data.pop_back(); |
|
21473
|
|
|
|
|
|
|
} |
|
21474
|
0
|
0
|
|
|
|
|
model_name = "from_model_" + to_string(1 + ++model_index); |
|
21475
|
|
|
|
|
|
|
} while (tagger.count(model_name)); |
|
21476
|
0
|
0
|
|
|
|
|
if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21477
|
|
|
|
|
|
|
|
|
21478
|
|
|
|
|
|
|
cerr << "Using tagger from given model(s)." << endl; |
|
21479
|
0
|
0
|
|
|
|
|
os.put(taggers_total); |
|
21480
|
0
|
0
|
|
|
|
|
for (auto&& tagger_data : taggers_data) |
|
21481
|
0
|
0
|
|
|
|
|
os.write(tagger_data.str + 1, tagger_data.len - 1); |
|
21482
|
|
|
|
|
|
|
} else { |
|
21483
|
|
|
|
|
|
|
// Create MorphoDiTa model(s) |
|
21484
|
0
|
0
|
|
|
|
|
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21485
|
0
|
0
|
|
|
|
|
if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21486
|
0
|
0
|
|
|
|
|
if (models > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21487
|
|
|
|
|
|
|
|
|
21488
|
0
|
0
|
|
|
|
|
os.put(models); |
|
21489
|
0
|
0
|
|
|
|
|
for (int model = 0; model < models; model++) |
|
21490
|
0
|
0
|
|
|
|
|
if (!train_tagger_model(training, heldout, model, models, tagger, os, error)) |
|
|
|
0
|
|
|
|
|
|
|
21491
|
|
|
|
|
|
|
return false; |
|
21492
|
|
|
|
|
|
|
} |
|
21493
|
|
|
|
|
|
|
} |
|
21494
|
|
|
|
|
|
|
|
|
21495
|
|
|
|
|
|
|
return true; |
|
21496
|
|
|
|
|
|
|
} |
|
21497
|
|
|
|
|
|
|
|
|
21498
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_parser(const vector& training, const vector& heldout, |
|
21499
|
|
|
|
|
|
|
const string& options, const string& tagger_model, ostream& os, string& error) { |
|
21500
|
0
|
0
|
|
|
|
|
if (options == NONE) { |
|
21501
|
0
|
|
|
|
|
|
os.put(0); |
|
21502
|
|
|
|
|
|
|
} else { |
|
21503
|
|
|
|
|
|
|
// Create Parsito model |
|
21504
|
|
|
|
|
|
|
named_values::map parser; |
|
21505
|
0
|
0
|
|
|
|
|
if (!named_values::parse(options, parser, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21506
|
0
|
0
|
|
|
|
|
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21507
|
|
|
|
|
|
|
|
|
21508
|
0
|
0
|
|
|
|
|
if (parser.count("from_model")) { |
|
|
|
0
|
|
|
|
|
|
|
21509
|
|
|
|
|
|
|
// Use specified parser model |
|
21510
|
|
|
|
|
|
|
string_piece parser_data; |
|
21511
|
0
|
0
|
|
|
|
|
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21512
|
0
|
0
|
|
|
|
|
return error.assign("Cannot load model from which the parser should be used!"), false; |
|
21513
|
|
|
|
|
|
|
|
|
21514
|
|
|
|
|
|
|
cerr << "Using parser from given model." << endl; |
|
21515
|
0
|
0
|
|
|
|
|
os.write(parser_data.str, parser_data.len); |
|
21516
|
|
|
|
|
|
|
} else { |
|
21517
|
0
|
0
|
|
|
|
|
os.put(1); |
|
21518
|
|
|
|
|
|
|
|
|
21519
|
|
|
|
|
|
|
// Parsito options |
|
21520
|
0
|
0
|
|
|
|
|
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21521
|
0
|
0
|
|
|
|
|
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21522
|
|
|
|
|
|
|
transition_system == "projective" ? "dynamic" : |
|
21523
|
|
|
|
|
|
|
transition_system == "swap" ? "static_lazy" : |
|
21524
|
0
|
0
|
|
|
|
|
"static"; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21525
|
|
|
|
|
|
|
|
|
21526
|
0
|
0
|
|
|
|
|
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21527
|
0
|
0
|
|
|
|
|
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21528
|
0
|
0
|
|
|
|
|
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21529
|
0
|
0
|
|
|
|
|
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21530
|
0
|
0
|
|
|
|
|
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21531
|
0
|
0
|
|
|
|
|
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21532
|
0
|
0
|
|
|
|
|
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21533
|
0
|
0
|
|
|
|
|
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21534
|
|
|
|
|
|
|
string embeddings; |
|
21535
|
0
|
0
|
|
|
|
|
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21536
|
0
|
0
|
|
|
|
|
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21537
|
0
|
0
|
|
|
|
|
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21538
|
0
|
0
|
|
|
|
|
if (embedding_form) { |
|
21539
|
0
|
0
|
|
|
|
|
embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount)); |
|
|
|
0
|
|
|
|
|
|
|
21540
|
0
|
0
|
|
|
|
|
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21541
|
0
|
0
|
|
|
|
|
embeddings.push_back('\n'); |
|
21542
|
|
|
|
|
|
|
} |
|
21543
|
0
|
0
|
|
|
|
|
if (embedding_lemma) { |
|
21544
|
0
|
0
|
|
|
|
|
embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount)); |
|
|
|
0
|
|
|
|
|
|
|
21545
|
0
|
0
|
|
|
|
|
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21546
|
0
|
0
|
|
|
|
|
embeddings.push_back('\n'); |
|
21547
|
|
|
|
|
|
|
} |
|
21548
|
0
|
0
|
|
|
|
|
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21549
|
|
|
|
|
|
|
|
|
21550
|
0
|
0
|
|
|
|
|
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21551
|
0
|
0
|
|
|
|
|
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21552
|
0
|
0
|
|
|
|
|
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21553
|
0
|
0
|
|
|
|
|
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21554
|
0
|
0
|
|
|
|
|
int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2); |
|
|
|
0
|
|
|
|
|
|
|
21555
|
0
|
0
|
|
|
|
|
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21556
|
0
|
0
|
|
|
|
|
double learning_rate = run <= 1 ? 0.02 : hyperparameter_logarithmic(run, 2, 0.005, 0.04); |
|
21557
|
0
|
0
|
|
|
|
|
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21558
|
0
|
0
|
|
|
|
|
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21559
|
0
|
0
|
|
|
|
|
double l2 = run <= 1 ? 0.5 : hyperparameter_uniform(run, 3, 0.2, 0.6); |
|
21560
|
0
|
0
|
|
|
|
|
if (!option_double(parser, "l2", l2, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21561
|
0
|
0
|
|
|
|
|
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21562
|
|
|
|
|
|
|
|
|
21563
|
0
|
0
|
|
|
|
|
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21564
|
0
|
|
|
|
|
|
<< ", learning_rate=" << fixed << setprecision(8) << learning_rate |
|
21565
|
0
|
|
|
|
|
|
<< ", l2=" << l2 << endl; |
|
21566
|
|
|
|
|
|
|
|
|
21567
|
|
|
|
|
|
|
// Prepare data in the correct format |
|
21568
|
|
|
|
|
|
|
parsito::network_parameters parameters; |
|
21569
|
0
|
|
|
|
|
|
parameters.iterations = iterations; |
|
21570
|
0
|
|
|
|
|
|
parameters.structured_interval = structured_interval; |
|
21571
|
0
|
|
|
|
|
|
parameters.hidden_layer = hidden_layer; |
|
21572
|
0
|
|
|
|
|
|
parameters.hidden_layer_type = parsito::activation_function::TANH; |
|
21573
|
0
|
|
|
|
|
|
parameters.trainer.algorithm = parsito::network_trainer::SGD; |
|
21574
|
0
|
|
|
|
|
|
parameters.trainer.learning_rate = learning_rate; |
|
21575
|
0
|
|
|
|
|
|
parameters.trainer.learning_rate_final = learning_rate_final; |
|
21576
|
0
|
|
|
|
|
|
parameters.trainer.momentum = 0; |
|
21577
|
0
|
|
|
|
|
|
parameters.trainer.epsilon = 0; |
|
21578
|
0
|
|
|
|
|
|
parameters.batch_size = batch_size; |
|
21579
|
0
|
|
|
|
|
|
parameters.initialization_range = 0.1f; |
|
21580
|
0
|
|
|
|
|
|
parameters.l1_regularization = 0; |
|
21581
|
0
|
|
|
|
|
|
parameters.l2_regularization = l2; |
|
21582
|
0
|
|
|
|
|
|
parameters.maxnorm_regularization = 0; |
|
21583
|
0
|
|
|
|
|
|
parameters.dropout_hidden = 0; |
|
21584
|
0
|
|
|
|
|
|
parameters.dropout_input = 0; |
|
21585
|
0
|
|
|
|
|
|
parameters.early_stopping = early_stopping; |
|
21586
|
|
|
|
|
|
|
|
|
21587
|
|
|
|
|
|
|
// Tag the input if required |
|
21588
|
|
|
|
|
|
|
unique_ptr tagger; |
|
21589
|
0
|
0
|
|
|
|
|
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21590
|
0
|
0
|
|
|
|
|
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21591
|
0
|
0
|
|
|
|
|
stringstream tagger_description; |
|
21592
|
0
|
0
|
|
|
|
|
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21593
|
0
|
0
|
|
|
|
|
tagger.reset(model_morphodita_parsito::load(tagger_description)); |
|
21594
|
0
|
0
|
|
|
|
|
if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21595
|
|
|
|
|
|
|
} |
|
21596
|
|
|
|
|
|
|
|
|
21597
|
|
|
|
|
|
|
// Training data |
|
21598
|
0
|
0
|
|
|
|
|
sentence tagged; |
|
21599
|
0
|
|
|
|
|
|
vector train_trees; |
|
21600
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) { |
|
21601
|
0
|
0
|
|
|
|
|
tagged = sentence; |
|
21602
|
0
|
0
|
|
|
|
|
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21603
|
|
|
|
|
|
|
|
|
21604
|
0
|
0
|
|
|
|
|
train_trees.emplace_back(); |
|
21605
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) { |
|
21606
|
0
|
|
|
|
|
|
train_trees.back().add_node(string()); |
|
21607
|
0
|
0
|
|
|
|
|
model_normalize_form(tagged.words[i].form, train_trees.back().nodes.back().form); |
|
21608
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().lemma.assign(tagged.words[i].lemma); |
|
21609
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().upostag.assign(tagged.words[i].upostag); |
|
21610
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().xpostag.assign(tagged.words[i].xpostag); |
|
21611
|
0
|
|
|
|
|
|
train_trees.back().nodes.back().feats.assign(tagged.words[i].feats); |
|
21612
|
|
|
|
|
|
|
} |
|
21613
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) |
|
21614
|
0
|
0
|
|
|
|
|
train_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
|
21615
|
|
|
|
|
|
|
} |
|
21616
|
|
|
|
|
|
|
|
|
21617
|
|
|
|
|
|
|
// Heldout data |
|
21618
|
0
|
|
|
|
|
|
vector heldout_trees; |
|
21619
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
|
21620
|
0
|
0
|
|
|
|
|
tagged = sentence; |
|
21621
|
0
|
0
|
|
|
|
|
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21622
|
|
|
|
|
|
|
|
|
21623
|
0
|
0
|
|
|
|
|
heldout_trees.emplace_back(); |
|
21624
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) { |
|
21625
|
0
|
|
|
|
|
|
heldout_trees.back().add_node(string()); |
|
21626
|
0
|
0
|
|
|
|
|
model_normalize_form(tagged.words[i].form, heldout_trees.back().nodes.back().form); |
|
21627
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().lemma.assign(tagged.words[i].lemma); |
|
21628
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().upostag.assign(tagged.words[i].upostag); |
|
21629
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().xpostag.assign(tagged.words[i].xpostag); |
|
21630
|
0
|
|
|
|
|
|
heldout_trees.back().nodes.back().feats.assign(tagged.words[i].feats); |
|
21631
|
|
|
|
|
|
|
} |
|
21632
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < tagged.words.size(); i++) |
|
21633
|
0
|
0
|
|
|
|
|
heldout_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
|
21634
|
|
|
|
|
|
|
} |
|
21635
|
|
|
|
|
|
|
|
|
21636
|
|
|
|
|
|
|
cerr << "Parser transition options: system=" << transition_system << ", oracle=" << transition_oracle |
|
21637
|
0
|
0
|
|
|
|
|
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21638
|
0
|
0
|
|
|
|
|
<< "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl |
|
|
|
0
|
|
|
|
|
|
|
21639
|
0
|
0
|
|
|
|
|
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21640
|
0
|
0
|
|
|
|
|
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21641
|
0
|
0
|
|
|
|
|
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21642
|
0
|
0
|
|
|
|
|
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21643
|
0
|
0
|
|
|
|
|
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21644
|
0
|
|
|
|
|
|
<< " learning_rate=" << fixed << setprecision(4) << learning_rate << ", learning_rate_final=" << learning_rate_final |
|
21645
|
0
|
0
|
|
|
|
|
<< ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
|
|
0
|
|
|
|
|
|
|
21646
|
|
|
|
|
|
|
|
|
21647
|
|
|
|
|
|
|
// Train the parser |
|
21648
|
0
|
0
|
|
|
|
|
binary_encoder enc; |
|
21649
|
0
|
0
|
|
|
|
|
enc.add_str("nn_versioned"); |
|
21650
|
0
|
|
|
|
|
|
parsito::parser_nn_trainer::train(transition_system, transition_oracle, single_root, embeddings, parser_nodes, |
|
21651
|
0
|
0
|
|
|
|
|
parameters, 1, train_trees, heldout_trees, enc); |
|
21652
|
0
|
0
|
|
|
|
|
compressor::save(os, enc); |
|
21653
|
|
|
|
|
|
|
} |
|
21654
|
|
|
|
|
|
|
} |
|
21655
|
|
|
|
|
|
|
|
|
21656
|
|
|
|
|
|
|
return true; |
|
21657
|
|
|
|
|
|
|
} |
|
21658
|
|
|
|
|
|
|
|
|
21659
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::load_model(const string& data, model_type model, string_piece& range) { |
|
21660
|
0
|
|
|
|
|
|
istringstream is(data); |
|
21661
|
|
|
|
|
|
|
|
|
21662
|
|
|
|
|
|
|
// Check that it is morphodita_parsito model. |
|
21663
|
|
|
|
|
|
|
char len; |
|
21664
|
0
|
0
|
|
|
|
|
if (!is.get(len)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21665
|
0
|
|
|
|
|
|
string name(len, ' '); |
|
21666
|
0
|
0
|
|
|
|
|
if (!is.read(&name[0], len)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21667
|
0
|
0
|
|
|
|
|
if (name != "morphodita_parsito") return false; |
|
21668
|
|
|
|
|
|
|
|
|
21669
|
|
|
|
|
|
|
char version; |
|
21670
|
0
|
0
|
|
|
|
|
if (!is.get(version)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21671
|
0
|
0
|
|
|
|
|
if (!(version >= 1 && version <= model_morphodita_parsito::VERSION_LATEST)) return false; |
|
21672
|
|
|
|
|
|
|
|
|
21673
|
|
|
|
|
|
|
// Because UDPipe 1.0 does not check the model version, |
|
21674
|
|
|
|
|
|
|
// a specific sentinel was added since version 2 so that |
|
21675
|
|
|
|
|
|
|
// loading of such model fail on UDPipe 1.0 |
|
21676
|
0
|
0
|
|
|
|
|
if (version >= 2) { |
|
21677
|
|
|
|
|
|
|
char sentinel; |
|
21678
|
0
|
0
|
|
|
|
|
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21679
|
0
|
0
|
|
|
|
|
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21680
|
|
|
|
|
|
|
} |
|
21681
|
|
|
|
|
|
|
|
|
21682
|
|
|
|
|
|
|
// Tokenizer |
|
21683
|
|
|
|
|
|
|
{ |
|
21684
|
0
|
0
|
|
|
|
|
if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg(); |
|
|
|
0
|
|
|
|
|
|
|
21685
|
0
|
0
|
|
|
|
|
char tokenizer; if (!is.get(tokenizer)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21686
|
0
|
0
|
|
|
|
|
unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
|
|
0
|
|
|
|
|
|
|
21687
|
0
|
0
|
|
|
|
|
if (tokenizer && !tokenizer_factory) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21688
|
0
|
0
|
|
|
|
|
unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
|
|
0
|
|
|
|
|
|
|
21689
|
0
|
0
|
|
|
|
|
if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
|
0
|
|
|
|
|
|
|
21690
|
|
|
|
|
|
|
} |
|
21691
|
|
|
|
|
|
|
|
|
21692
|
|
|
|
|
|
|
// Tagger |
|
21693
|
|
|
|
|
|
|
{ |
|
21694
|
0
|
0
|
|
|
|
|
if (model == TAGGER_MODEL) range.str = data.data() + is.tellg(); |
|
|
|
0
|
|
|
|
|
|
|
21695
|
0
|
0
|
|
|
|
|
char taggers; if (!is.get(taggers)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21696
|
0
|
0
|
|
|
|
|
for (char i = 0; i < taggers; i++) { |
|
21697
|
0
|
0
|
|
|
|
|
char lemma; if (!is.get(lemma)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21698
|
0
|
0
|
|
|
|
|
char xpostag; if (!is.get(xpostag)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21699
|
0
|
0
|
|
|
|
|
char feats; if (!is.get(feats)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21700
|
0
|
0
|
|
|
|
|
unique_ptr tagger(morphodita::tagger::load(is)); |
|
21701
|
0
|
0
|
|
|
|
|
if (!tagger) return false; |
|
21702
|
|
|
|
|
|
|
} |
|
21703
|
0
|
0
|
|
|
|
|
if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
|
0
|
|
|
|
|
|
|
21704
|
|
|
|
|
|
|
} |
|
21705
|
|
|
|
|
|
|
|
|
21706
|
|
|
|
|
|
|
// Parser |
|
21707
|
|
|
|
|
|
|
{ |
|
21708
|
0
|
0
|
|
|
|
|
if (model == PARSER_MODEL) range.str = data.data() + is.tellg(); |
|
|
|
0
|
|
|
|
|
|
|
21709
|
|
|
|
|
|
|
char parser; |
|
21710
|
0
|
0
|
|
|
|
|
if (!is.get(parser)) return false; |
|
|
|
0
|
|
|
|
|
|
|
21711
|
0
|
0
|
|
|
|
|
unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr); |
|
|
|
0
|
|
|
|
|
|
|
21712
|
0
|
0
|
|
|
|
|
if (parser && !parser_model) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21713
|
0
|
0
|
|
|
|
|
if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
|
0
|
|
|
|
|
|
|
21714
|
|
|
|
|
|
|
} |
|
21715
|
|
|
|
|
|
|
|
|
21716
|
0
|
|
|
|
|
|
return false; |
|
21717
|
|
|
|
|
|
|
} |
|
21718
|
|
|
|
|
|
|
|
|
21719
|
0
|
|
|
|
|
|
const string& trainer_morphodita_parsito::model_normalize_form(string_piece form, string& output) { |
|
21720
|
0
|
0
|
|
|
|
|
return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_form(form, output); |
|
21721
|
|
|
|
|
|
|
} |
|
21722
|
|
|
|
|
|
|
|
|
21723
|
0
|
|
|
|
|
|
const string& trainer_morphodita_parsito::model_normalize_lemma(string_piece lemma, string& output) { |
|
21724
|
0
|
0
|
|
|
|
|
return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_lemma(lemma, output); |
|
21725
|
|
|
|
|
|
|
} |
|
21726
|
|
|
|
|
|
|
|
|
21727
|
0
|
|
|
|
|
|
void trainer_morphodita_parsito::model_fill_word_analysis(const morphodita::tagged_lemma& analysis, bool upostag, int lemma, bool xpostag, bool feats, word& word) { |
|
21728
|
0
|
0
|
|
|
|
|
model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).fill_word_analysis(analysis, false, upostag, lemma, xpostag, feats, word); |
|
21729
|
0
|
|
|
|
|
|
} |
|
21730
|
|
|
|
|
|
|
|
|
21731
|
|
|
|
|
|
|
// Tagger model helper functions |
|
21732
|
|
|
|
|
|
|
|
|
21733
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::train_tagger_model(const vector& training, const vector& heldout, |
|
21734
|
|
|
|
|
|
|
unsigned model, unsigned models, const named_values::map& tagger, |
|
21735
|
|
|
|
|
|
|
ostream& os, string& error) { |
|
21736
|
0
|
0
|
|
|
|
|
unique_ptr conllu_input_format(input_format::new_conllu_input_format()); |
|
21737
|
|
|
|
|
|
|
|
|
21738
|
0
|
0
|
|
|
|
|
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21739
|
|
|
|
|
|
|
|
|
21740
|
|
|
|
|
|
|
bool have_lemma = false; |
|
21741
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
|
21742
|
0
|
0
|
|
|
|
|
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21743
|
0
|
0
|
|
|
|
|
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21744
|
|
|
|
|
|
|
have_lemma = true; |
|
21745
|
0
|
0
|
|
|
|
|
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21746
|
0
|
0
|
|
|
|
|
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21747
|
0
|
0
|
|
|
|
|
int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0; |
|
|
|
0
|
|
|
|
|
|
|
21748
|
0
|
0
|
|
|
|
|
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21749
|
0
|
0
|
|
|
|
|
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21750
|
|
|
|
|
|
|
|
|
21751
|
0
|
0
|
|
|
|
|
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21752
|
0
|
0
|
|
|
|
|
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21753
|
0
|
0
|
|
|
|
|
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21754
|
0
|
0
|
|
|
|
|
os.put(char(provide_lemma ? use_lemma : 0)); |
|
|
|
0
|
|
|
|
|
|
|
21755
|
0
|
0
|
|
|
|
|
os.put(char(provide_xpostag && use_xpostag)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21756
|
0
|
0
|
|
|
|
|
os.put(char(provide_feats && use_feats)); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21757
|
|
|
|
|
|
|
|
|
21758
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21759
|
0
|
0
|
|
|
|
|
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21760
|
0
|
0
|
|
|
|
|
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21761
|
|
|
|
|
|
|
|
|
21762
|
|
|
|
|
|
|
// Start by creating the morphological dictionary |
|
21763
|
0
|
0
|
|
|
|
|
stringstream morpho_description; |
|
21764
|
|
|
|
|
|
|
string normalized_form, combined_tag, combined_lemma; |
|
21765
|
|
|
|
|
|
|
|
|
21766
|
|
|
|
|
|
|
// Generic options |
|
21767
|
0
|
0
|
|
|
|
|
const string& dictionary_model = option_str(tagger, "dictionary_model", model); |
|
|
|
0
|
|
|
|
|
|
|
21768
|
0
|
0
|
|
|
|
|
if (!dictionary_model.empty()) { |
|
21769
|
|
|
|
|
|
|
// Use specified morphological dictionary |
|
21770
|
|
|
|
|
|
|
cerr << "Using given morphological dictionary for tagger model " << model+1 << "." << endl; |
|
21771
|
|
|
|
|
|
|
morpho_description << dictionary_model; |
|
21772
|
|
|
|
|
|
|
} else { |
|
21773
|
|
|
|
|
|
|
// Create the morphological dictionary and guesser from data |
|
21774
|
|
|
|
|
|
|
cerr << "Creating morphological dictionary for tagger model " << model+1 << "." << endl; |
|
21775
|
|
|
|
|
|
|
|
|
21776
|
|
|
|
|
|
|
// Dictionary options |
|
21777
|
0
|
0
|
|
|
|
|
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21778
|
|
|
|
|
|
|
unordered_set flat_lemmas; |
|
21779
|
0
|
0
|
|
|
|
|
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21780
|
0
|
|
|
|
|
|
vector lemmas; |
|
21781
|
0
|
0
|
|
|
|
|
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21782
|
0
|
0
|
|
|
|
|
for (auto&& lemma : lemmas) { |
|
21783
|
0
|
0
|
|
|
|
|
if (lemma.find('~') != string::npos) |
|
21784
|
0
|
0
|
|
|
|
|
return error.assign("Dictionary_flat_lemmas cannot contain '~' character!"), false; |
|
21785
|
|
|
|
|
|
|
flat_lemmas.insert(lemma); |
|
21786
|
|
|
|
|
|
|
} |
|
21787
|
|
|
|
|
|
|
} else { |
|
21788
|
0
|
0
|
|
|
|
|
flat_lemmas.insert("greek.expression"); |
|
21789
|
|
|
|
|
|
|
} |
|
21790
|
|
|
|
|
|
|
|
|
21791
|
0
|
0
|
|
|
|
|
if (!option_str(tagger, "dictionary", model).empty()) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21792
|
0
|
0
|
|
|
|
|
return error.assign("The tagger 'dictionary' option is no longer supported, use 'dictionary_file' instead!"), false; |
|
21793
|
0
|
0
|
|
|
|
|
const string& dictionary_file = option_str(tagger, "dictionary_file", model); |
|
|
|
0
|
|
|
|
|
|
|
21794
|
0
|
0
|
|
|
|
|
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21795
|
|
|
|
|
|
|
|
|
21796
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " dictionary options: " << "max_form_analyses=" << max_form_analyses |
|
21797
|
0
|
0
|
|
|
|
|
<< ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl; |
|
|
|
0
|
|
|
|
|
|
|
21798
|
|
|
|
|
|
|
|
|
21799
|
|
|
|
|
|
|
// Guesser options |
|
21800
|
0
|
0
|
|
|
|
|
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21801
|
0
|
0
|
|
|
|
|
int guesser_suffix_rules = run <= 1 ? 8 : 5 + hyperparameter_integer(run, 1, 0, 7); |
|
21802
|
0
|
0
|
|
|
|
|
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21803
|
0
|
0
|
|
|
|
|
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21804
|
0
|
0
|
|
|
|
|
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21805
|
0
|
0
|
|
|
|
|
int guesser_enrich_dictionary = run <= 1 ? 6 : 3 + hyperparameter_integer(run, 2, 0, 7); |
|
21806
|
0
|
0
|
|
|
|
|
if (!dictionary_file.empty()) guesser_enrich_dictionary = 0; |
|
21807
|
0
|
0
|
|
|
|
|
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21808
|
|
|
|
|
|
|
|
|
21809
|
0
|
0
|
|
|
|
|
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21810
|
0
|
0
|
|
|
|
|
<< ", guesser_enrich_dictionary=" << guesser_enrich_dictionary << endl; |
|
21811
|
|
|
|
|
|
|
|
|
21812
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " guesser options: " << "suffix_rules=" << guesser_suffix_rules |
|
21813
|
0
|
0
|
|
|
|
|
<< ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count |
|
|
|
0
|
|
|
|
|
|
|
21814
|
0
|
0
|
|
|
|
|
<< ", enrich_dictionary=" << guesser_enrich_dictionary << endl; |
|
21815
|
|
|
|
|
|
|
|
|
21816
|
|
|
|
|
|
|
// Start by generating statistical guesser |
|
21817
|
0
|
0
|
|
|
|
|
stringstream guesser_description; |
|
21818
|
|
|
|
|
|
|
{ |
|
21819
|
0
|
0
|
|
|
|
|
stringstream guesser_input; |
|
21820
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) { |
|
21821
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
|
21822
|
0
|
0
|
|
|
|
|
guesser_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
|
21823
|
0
|
0
|
|
|
|
|
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas) << '\t' |
|
21824
|
0
|
0
|
|
|
|
|
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
|
21825
|
|
|
|
|
|
|
guesser_input << '\n'; |
|
21826
|
|
|
|
|
|
|
} |
|
21827
|
0
|
0
|
|
|
|
|
morphodita::morpho_statistical_guesser_trainer::train(guesser_input, guesser_suffix_len, guesser_suffix_rules, guesser_prefixes_max, guesser_prefix_min_count, guesser_description); |
|
21828
|
|
|
|
|
|
|
} |
|
21829
|
|
|
|
|
|
|
|
|
21830
|
|
|
|
|
|
|
// Generate morphological dictionary data from the input |
|
21831
|
|
|
|
|
|
|
unordered_set dictionary_entries; |
|
21832
|
|
|
|
|
|
|
{ |
|
21833
|
|
|
|
|
|
|
unordered_map> entries; |
|
21834
|
|
|
|
|
|
|
string entry; |
|
21835
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
|
21836
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) { |
|
21837
|
0
|
0
|
|
|
|
|
model_normalize_form(sentence.words[i].form, normalized_form); |
|
21838
|
0
|
0
|
|
|
|
|
entry.assign(combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas)) |
|
21839
|
0
|
0
|
|
|
|
|
.append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag)) |
|
|
|
0
|
|
|
|
|
|
|
21840
|
0
|
0
|
|
|
|
|
.append("\t").append(normalized_form); |
|
21841
|
0
|
|
|
|
|
|
entries[normalized_form][entry]++; |
|
21842
|
|
|
|
|
|
|
} |
|
21843
|
|
|
|
|
|
|
|
|
21844
|
0
|
|
|
|
|
|
vector> analyses; |
|
21845
|
0
|
0
|
|
|
|
|
for (auto&& form_analyses : entries) { |
|
21846
|
0
|
|
|
|
|
|
analyses.clear(); |
|
21847
|
0
|
0
|
|
|
|
|
for (auto&& analysis : form_analyses.second) |
|
21848
|
0
|
0
|
|
|
|
|
analyses.emplace_back(analysis.second, analysis.first); |
|
21849
|
0
|
0
|
|
|
|
|
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21850
|
|
|
|
|
|
|
sort(analyses.begin(), analyses.end(), greater>()); |
|
21851
|
0
|
0
|
|
|
|
|
analyses.resize(max_form_analyses); |
|
21852
|
|
|
|
|
|
|
} |
|
21853
|
0
|
0
|
|
|
|
|
for (auto&& analysis : analyses) |
|
21854
|
0
|
|
|
|
|
|
dictionary_entries.insert(analysis.second); |
|
21855
|
|
|
|
|
|
|
} |
|
21856
|
|
|
|
|
|
|
} |
|
21857
|
0
|
|
|
|
|
|
morphodita::generic_morpho_encoder::tags dictionary_special_tags; |
|
21858
|
|
|
|
|
|
|
dictionary_special_tags.unknown_tag = "~X"; |
|
21859
|
0
|
0
|
|
|
|
|
dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag); |
|
|
|
0
|
|
|
|
|
|
|
21860
|
0
|
0
|
|
|
|
|
dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag); |
|
|
|
0
|
|
|
|
|
|
|
21861
|
0
|
0
|
|
|
|
|
dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag); |
|
|
|
0
|
|
|
|
|
|
|
21862
|
|
|
|
|
|
|
|
|
21863
|
|
|
|
|
|
|
// Append given dictionary_file if given |
|
21864
|
0
|
0
|
|
|
|
|
if (!dictionary_file.empty()) { |
|
21865
|
0
|
0
|
|
|
|
|
ifstream is(path_from_utf8(dictionary_file).c_str()); |
|
21866
|
0
|
0
|
|
|
|
|
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21867
|
|
|
|
|
|
|
|
|
21868
|
|
|
|
|
|
|
vector dictionary_parts; |
|
21869
|
0
|
0
|
|
|
|
|
word entry; |
|
21870
|
|
|
|
|
|
|
string entry_encoded, line; |
|
21871
|
0
|
0
|
|
|
|
|
while (getline(is, line)) { |
|
|
|
0
|
|
|
|
|
|
|
21872
|
|
|
|
|
|
|
// Skip empty lines |
|
21873
|
0
|
0
|
|
|
|
|
if (line.empty()) continue; |
|
21874
|
|
|
|
|
|
|
|
|
21875
|
0
|
0
|
|
|
|
|
split(line, '\t', dictionary_parts); |
|
21876
|
|
|
|
|
|
|
|
|
21877
|
0
|
0
|
|
|
|
|
if (dictionary_parts.size() != 5) |
|
21878
|
0
|
0
|
|
|
|
|
return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21879
|
|
|
|
|
|
|
|
|
21880
|
0
|
0
|
|
|
|
|
model_normalize_form(dictionary_parts[0], entry.form); |
|
21881
|
0
|
0
|
|
|
|
|
entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len); |
|
|
|
0
|
|
|
|
|
|
|
21882
|
0
|
0
|
|
|
|
|
entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len); |
|
|
|
0
|
|
|
|
|
|
|
21883
|
0
|
0
|
|
|
|
|
entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len); |
|
|
|
0
|
|
|
|
|
|
|
21884
|
0
|
0
|
|
|
|
|
entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len); |
|
|
|
0
|
|
|
|
|
|
|
21885
|
|
|
|
|
|
|
|
|
21886
|
0
|
0
|
|
|
|
|
entry_encoded.assign(combine_lemma(entry, use_lemma, combined_lemma, flat_lemmas)) |
|
21887
|
0
|
0
|
|
|
|
|
.append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag)) |
|
|
|
0
|
|
|
|
|
|
|
21888
|
0
|
0
|
|
|
|
|
.append("\t").append(entry.form); |
|
21889
|
|
|
|
|
|
|
dictionary_entries.insert(entry_encoded); |
|
21890
|
|
|
|
|
|
|
} |
|
21891
|
|
|
|
|
|
|
} |
|
21892
|
|
|
|
|
|
|
|
|
21893
|
|
|
|
|
|
|
// Enrich the dictionary if required |
|
21894
|
0
|
0
|
|
|
|
|
if (guesser_enrich_dictionary) { |
|
21895
|
|
|
|
|
|
|
// Create temporary morphology using only the guesser |
|
21896
|
0
|
0
|
|
|
|
|
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21897
|
0
|
0
|
|
|
|
|
guesser_only_morphology.put(morphodita::morpho_ids::GENERIC); |
|
21898
|
0
|
0
|
|
|
|
|
morphodita::generic_morpho_encoder::encode(empty_data, dictionary_suffix_len, dictionary_special_tags, guesser_description_copy, guesser_only_morphology); |
|
21899
|
|
|
|
|
|
|
|
|
21900
|
0
|
0
|
|
|
|
|
unique_ptr guesser_only_morpho(morphodita::morpho::load(guesser_only_morphology)); |
|
21901
|
0
|
0
|
|
|
|
|
if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21902
|
|
|
|
|
|
|
|
|
21903
|
|
|
|
|
|
|
string entry; |
|
21904
|
|
|
|
|
|
|
unordered_set analyzed_forms; |
|
21905
|
0
|
|
|
|
|
|
vector analyses; |
|
21906
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) |
|
21907
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) { |
|
21908
|
0
|
0
|
|
|
|
|
const auto& form = model_normalize_form(sentence.words[i].form, normalized_form); |
|
21909
|
0
|
0
|
|
|
|
|
if (!analyzed_forms.count(form)) { |
|
21910
|
0
|
0
|
|
|
|
|
guesser_only_morpho->analyze(form, morphodita::morpho::GUESSER, analyses); |
|
21911
|
|
|
|
|
|
|
|
|
21912
|
0
|
|
|
|
|
|
int to_add = guesser_enrich_dictionary; |
|
21913
|
0
|
0
|
|
|
|
|
for (auto&& analyse : analyses) { |
|
21914
|
0
|
0
|
|
|
|
|
entry.assign(analyse.lemma).push_back('\t'); |
|
21915
|
0
|
0
|
|
|
|
|
entry.append(analyse.tag).push_back('\t'); |
|
21916
|
|
|
|
|
|
|
entry.append(form); |
|
21917
|
0
|
0
|
|
|
|
|
if (dictionary_entries.insert(entry).second) |
|
21918
|
0
|
0
|
|
|
|
|
if (!--to_add) |
|
21919
|
|
|
|
|
|
|
break; |
|
21920
|
|
|
|
|
|
|
} |
|
21921
|
|
|
|
|
|
|
analyzed_forms.insert(form); |
|
21922
|
|
|
|
|
|
|
} |
|
21923
|
|
|
|
|
|
|
} |
|
21924
|
|
|
|
|
|
|
} |
|
21925
|
|
|
|
|
|
|
|
|
21926
|
|
|
|
|
|
|
// Create the dictionary |
|
21927
|
0
|
0
|
|
|
|
|
vector sorted_dictionary(dictionary_entries.begin(), dictionary_entries.end()); |
|
21928
|
|
|
|
|
|
|
sort(sorted_dictionary.begin(), sorted_dictionary.end()); |
|
21929
|
|
|
|
|
|
|
|
|
21930
|
0
|
0
|
|
|
|
|
stringstream morpho_input; |
|
21931
|
0
|
0
|
|
|
|
|
for (auto&& entry : sorted_dictionary) |
|
21932
|
|
|
|
|
|
|
morpho_input << entry << '\n'; |
|
21933
|
|
|
|
|
|
|
|
|
21934
|
0
|
0
|
|
|
|
|
morpho_description.put(morphodita::morpho_ids::GENERIC); |
|
21935
|
0
|
0
|
|
|
|
|
morphodita::generic_morpho_encoder::encode(morpho_input, dictionary_suffix_len, dictionary_special_tags, guesser_description, morpho_description); |
|
21936
|
|
|
|
|
|
|
} |
|
21937
|
|
|
|
|
|
|
|
|
21938
|
|
|
|
|
|
|
// Measure dictionary accuracy if required |
|
21939
|
0
|
0
|
|
|
|
|
const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model); |
|
|
|
0
|
|
|
|
|
|
|
21940
|
0
|
0
|
|
|
|
|
if (!dictionary_accuracy.empty()) { |
|
21941
|
0
|
0
|
|
|
|
|
unique_ptr morpho(morphodita::morpho::load(morpho_description)); |
|
21942
|
0
|
0
|
|
|
|
|
if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false; |
|
|
|
0
|
|
|
|
|
|
|
21943
|
0
|
0
|
|
|
|
|
morpho_description.seekg(0, ios::beg); |
|
21944
|
|
|
|
|
|
|
|
|
21945
|
|
|
|
|
|
|
// Measure dictionary accuracy on given data |
|
21946
|
|
|
|
|
|
|
unsigned words = 0, total_analyses = 0, upostag = 0, xpostag = 0, feats = 0, all_tags = 0, lemma = 0; |
|
21947
|
|
|
|
|
|
|
|
|
21948
|
0
|
0
|
|
|
|
|
word w; |
|
21949
|
0
|
|
|
|
|
|
vector analyses; |
|
21950
|
0
|
0
|
|
|
|
|
conllu_input_format->set_text(dictionary_accuracy.c_str()); |
|
21951
|
0
|
0
|
|
|
|
|
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21952
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) { |
|
21953
|
0
|
0
|
|
|
|
|
morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses); |
|
|
|
0
|
|
|
|
|
|
|
21954
|
|
|
|
|
|
|
unsigned upostag_ok = 0, xpostag_ok = 0, feats_ok = 0, all_tags_ok = 0, lemma_ok = 0; |
|
21955
|
0
|
0
|
|
|
|
|
for (auto&& analysis : analyses) { |
|
21956
|
0
|
0
|
|
|
|
|
w.lemma.assign("_"); |
|
21957
|
0
|
0
|
|
|
|
|
model_fill_word_analysis(analysis, true, use_lemma, true, true, w); |
|
21958
|
0
|
|
|
|
|
|
upostag_ok |= int(sentence.words[i].upostag == w.upostag); |
|
21959
|
0
|
|
|
|
|
|
xpostag_ok |= int(sentence.words[i].xpostag == w.xpostag); |
|
21960
|
0
|
|
|
|
|
|
feats_ok |= int(sentence.words[i].feats == w.feats); |
|
21961
|
0
|
0
|
|
|
|
|
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21962
|
0
|
|
|
|
|
|
lemma_ok |= int(sentence.words[i].lemma == w.lemma); |
|
21963
|
|
|
|
|
|
|
} |
|
21964
|
0
|
|
|
|
|
|
words++; |
|
21965
|
0
|
|
|
|
|
|
total_analyses += analyses.size(); |
|
21966
|
0
|
|
|
|
|
|
upostag += upostag_ok; |
|
21967
|
0
|
|
|
|
|
|
xpostag += xpostag_ok; |
|
21968
|
0
|
|
|
|
|
|
feats += feats_ok; |
|
21969
|
0
|
|
|
|
|
|
all_tags += all_tags_ok; |
|
21970
|
0
|
|
|
|
|
|
lemma += lemma_ok; |
|
21971
|
|
|
|
|
|
|
} |
|
21972
|
0
|
0
|
|
|
|
|
if (!error.empty()) return false; |
|
21973
|
|
|
|
|
|
|
|
|
21974
|
|
|
|
|
|
|
cerr << "Dictionary accuracy for tagging model " << model+1 << " - forms: " << words |
|
21975
|
0
|
|
|
|
|
|
<< ", analyses per form: " << fixed << setprecision(2) << total_analyses / double(words) |
|
21976
|
0
|
|
|
|
|
|
<< ", upostag: " << setprecision(1) << 100. * upostag / words << "%, xpostag: " << 100. * xpostag / words |
|
21977
|
0
|
|
|
|
|
|
<< "%, feats: " << 100. * feats / words << "%, all tags: " << 100. * all_tags / words << "%, lemma: " << 100. * lemma / words << '%' << endl; |
|
21978
|
|
|
|
|
|
|
} |
|
21979
|
|
|
|
|
|
|
|
|
21980
|
|
|
|
|
|
|
// Tagger options |
|
21981
|
0
|
0
|
|
|
|
|
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21982
|
|
|
|
|
|
|
morphodita::tagger_id tagger_id; |
|
21983
|
0
|
0
|
|
|
|
|
if (tagger_order == 2) tagger_id = morphodita::tagger_ids::CONLLU2; |
|
21984
|
0
|
0
|
|
|
|
|
else if (tagger_order == 2.5) tagger_id = morphodita::tagger_ids::CONLLU2_3; |
|
21985
|
0
|
0
|
|
|
|
|
else if (tagger_order == 3) tagger_id = morphodita::tagger_ids::CONLLU3; |
|
21986
|
0
|
0
|
|
|
|
|
else return error.assign("The tagger_order can be only 2, 2.5 or 3!"), false; |
|
21987
|
|
|
|
|
|
|
|
|
21988
|
0
|
0
|
|
|
|
|
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21989
|
0
|
0
|
|
|
|
|
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21990
|
0
|
0
|
|
|
|
|
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21991
|
|
|
|
|
|
|
const string& tagger_feature_templates = |
|
21992
|
0
|
0
|
|
|
|
|
option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger : |
|
|
|
0
|
|
|
|
|
|
|
21993
|
0
|
0
|
|
|
|
|
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21994
|
0
|
0
|
|
|
|
|
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21995
|
0
|
0
|
|
|
|
|
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
21996
|
0
|
0
|
|
|
|
|
if (heldout.empty()) tagger_early_stopping = false; |
|
21997
|
|
|
|
|
|
|
|
|
21998
|
0
|
0
|
|
|
|
|
cerr << "Tagger model " << model+1 << " options: iterations=" << tagger_iterations |
|
21999
|
0
|
0
|
|
|
|
|
<< ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates=" |
|
|
|
0
|
|
|
|
|
|
|
22000
|
0
|
|
|
|
|
|
<< (tagger_feature_templates == tagger_features_tagger ? "tagger" : |
|
22001
|
0
|
0
|
|
|
|
|
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22002
|
|
|
|
|
|
|
|
|
22003
|
|
|
|
|
|
|
// Train the tagger |
|
22004
|
|
|
|
|
|
|
cerr << "Training tagger model " << model+1 << "." << endl; |
|
22005
|
0
|
0
|
|
|
|
|
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22006
|
0
|
0
|
|
|
|
|
for (auto&& sentence : training) { |
|
22007
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
|
22008
|
0
|
0
|
|
|
|
|
input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
|
22009
|
0
|
0
|
|
|
|
|
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
|
22010
|
0
|
0
|
|
|
|
|
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
|
22011
|
|
|
|
|
|
|
input << '\n'; |
|
22012
|
|
|
|
|
|
|
} |
|
22013
|
|
|
|
|
|
|
|
|
22014
|
0
|
0
|
|
|
|
|
for (auto&& sentence : heldout) { |
|
22015
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < sentence.words.size(); i++) |
|
22016
|
0
|
0
|
|
|
|
|
heldout_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
|
22017
|
0
|
0
|
|
|
|
|
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
|
22018
|
0
|
0
|
|
|
|
|
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
|
22019
|
|
|
|
|
|
|
heldout_input << '\n'; |
|
22020
|
|
|
|
|
|
|
} |
|
22021
|
|
|
|
|
|
|
|
|
22022
|
0
|
0
|
|
|
|
|
os.put(tagger_id); |
|
22023
|
0
|
0
|
|
|
|
|
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22024
|
|
|
|
|
|
|
|
|
22025
|
|
|
|
|
|
|
return true; |
|
22026
|
|
|
|
|
|
|
} |
|
22027
|
|
|
|
|
|
|
|
|
22028
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::can_combine_tag(const word& w, string& error) { |
|
22029
|
|
|
|
|
|
|
error.clear(); |
|
22030
|
|
|
|
|
|
|
|
|
22031
|
|
|
|
|
|
|
unsigned separator = 0; |
|
22032
|
0
|
0
|
|
|
|
|
while (separator < tag_separators.size() && |
|
22033
|
0
|
0
|
|
|
|
|
(w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos)) |
|
22034
|
0
|
|
|
|
|
|
separator++; |
|
22035
|
|
|
|
|
|
|
|
|
22036
|
0
|
0
|
|
|
|
|
if (separator >= tag_separators.size()) { |
|
22037
|
0
|
|
|
|
|
|
error.assign("Cannot find tag separating character, UPOSTAG and XPOSTAG contain all of '").append(tag_separators).append("'!"); |
|
22038
|
0
|
|
|
|
|
|
return false; |
|
22039
|
|
|
|
|
|
|
} |
|
22040
|
|
|
|
|
|
|
return true; |
|
22041
|
|
|
|
|
|
|
} |
|
22042
|
|
|
|
|
|
|
|
|
22043
|
0
|
|
|
|
|
|
// Build a single combined tag for the given word into combined_tag and return
// a reference to it.
//
// The separator is the first character of tag_separators that occurs in
// neither w.upostag nor w.xpostag. The result has the form
//   <sep> UPOSTAG                       when neither xpostag nor feats is set,
//   <sep> UPOSTAG <sep> [XPOSTAG]       when only xpostag is set,
//   <sep> UPOSTAG <sep> [XPOSTAG] <sep> FEATS   when feats is set,
// where XPOSTAG is emitted only if the xpostag flag is set (so with feats
// alone two separators follow each other).
const string& trainer_morphodita_parsito::combine_tag(const word& w, bool xpostag, bool feats, string& combined_tag) {
  // Fall back to the first candidate when no free separator exists.
  // Should not happen, as can_combine_tag was called before
  char separator = tag_separators[0];
  for (char candidate : tag_separators)
    if (w.upostag.find(candidate) == string::npos && w.xpostag.find(candidate) == string::npos) {
      separator = candidate;
      break;
    }

  // The separator itself is stored as the first character of the result.
  combined_tag.assign(1, separator);
  combined_tag.append(w.upostag);

  if (xpostag || feats) {
    combined_tag.push_back(separator);
    if (xpostag) combined_tag.append(w.xpostag);
    if (feats) {
      combined_tag.push_back(separator);
      combined_tag.append(w.feats);
    }
  }

  return combined_tag;
}
|
22063
|
|
|
|
|
|
|
|
|
22064
|
0
|
|
|
|
|
|
// Find the combined tag most frequently used in data by words whose UPOSTAG
// equals the given upostag, store it in combined_tag and return a reference
// to it.
//
// Words are scanned from index 1 of every sentence (index 0 is skipped --
// presumably the artificial root; confirm against the sentence type). The
// xpostag and feats flags are passed through to combine_tag. When no word
// has the requested UPOSTAG, the result is "~" followed by upostag.
//
// Ties between equally frequent tags are resolved in favour of the
// lexicographically smallest combined tag, so that the result does not depend
// on the iteration order of the hash map.
const string& trainer_morphodita_parsito::most_frequent_tag(const vector<sentence>& data, const string& upostag, bool xpostag, bool feats, string& combined_tag) {
  unordered_map<string, unsigned> counts;

  // combined_tag serves as the scratch buffer of combine_tag here.
  for (const auto& s : data)
    for (size_t i = 1; i < s.words.size(); i++)
      if (s.words[i].upostag == upostag)
        counts[combine_tag(s.words[i], xpostag, feats, combined_tag)]++;

  // Default used when counts is empty.
  combined_tag.assign("~").append(upostag);

  unsigned best = 0;
  for (const auto& tag_count : counts)
    // Every stored count is at least 1, so the first entry always replaces
    // the default; afterwards equal counts are ordered by the tag string.
    if (tag_count.second > best || (tag_count.second == best && tag_count.first < combined_tag)) {
      best = tag_count.second;
      combined_tag.assign(tag_count.first);
    }

  return combined_tag;
}
|
22081
|
|
|
|
|
|
|
|
|
22082
|
0
|
|
|
|
|
|
// Produce the lemma string used for training into combined_lemma and return
// a reference to the produced string.
//
// use_lemma selects the encoding:
//   0       - the normalized form of the word is used instead of a lemma;
//   1       - the normalized lemma is used, unless the lemma (raw or
//             normalized) is listed in flat_lemmas, in which case the
//             normalized form is used;
//   other   - (2) an empty lemma yields "~~" + normalized form, the lemma "_"
//             yields "~_~" + normalized form, a lemma listed in flat_lemmas
//             yields "~" + normalized lemma + "~" + normalized form, and any
//             other lemma yields the normalized lemma.
const string& trainer_morphodita_parsito::combine_lemma(const word& w, int use_lemma, string& combined_lemma, const unordered_set<string>& flat_lemmas) {
  if (use_lemma == 0)
    return model_normalize_form(w.form, combined_lemma);

  // Missing lemmas are marked by a special prefix in the richest encoding only.
  if (use_lemma != 1 && (w.lemma.empty() || w.lemma == "_")) {
    const char* marker = w.lemma.empty() ? "~~" : "~_~";
    model_normalize_form(w.form, combined_lemma);
    return combined_lemma.insert(0, marker);
  }

  model_normalize_lemma(w.lemma, combined_lemma);
  const bool is_flat = flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma);
  if (!is_flat)
    return combined_lemma;

  if (use_lemma == 1)
    return model_normalize_form(w.form, combined_lemma);

  // Flat lemma in the richest encoding: keep both the lemma and the form.
  string normalized_form;
  model_normalize_form(w.form, normalized_form);
  return combined_lemma.insert(0, "~").append("~").append(normalized_form);
}
|
22106
|
|
|
|
|
|
|
|
|
22107
|
|
|
|
|
|
|
// Generic options handling |
|
22108
|
|
|
|
|
|
|
|
|
22109
|
0
|
|
|
|
|
|
// Return the string value of the option called name.
//
// For model in the range 0..8 the key name + "_" + digit ('1' + model) is
// looked up first; the plain name is used when the indexed key is absent or
// model is outside that range. A reference to empty_string is returned when
// neither key is present.
const string& trainer_morphodita_parsito::option_str(const named_values::map& options, const string& name, int model) {
  if (model >= 0 && model < 9) {
    string indexed_name(name);
    indexed_name.push_back('_');
    indexed_name.push_back(char('1' + model));

    const auto indexed = options.find(indexed_name);
    if (indexed != options.end())
      return indexed->second;
  }

  const auto generic = options.find(name);
  return generic != options.end() ? generic->second : empty_string;
}
|
22115
|
|
|
|
|
|
|
|
|
22116
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::option_int(const named_values::map& options, const string& name, int& value, string& error, int model) { |
|
22117
|
|
|
|
|
|
|
string indexed_name(name); |
|
22118
|
0
|
0
|
|
|
|
|
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22119
|
|
|
|
|
|
|
|
|
22120
|
0
|
0
|
|
|
|
|
if (options.count(indexed_name)) |
|
22121
|
0
|
0
|
|
|
|
|
return parse_int(options.at(indexed_name), name.c_str(), value, error); |
|
22122
|
0
|
0
|
|
|
|
|
if (options.count(name)) |
|
22123
|
0
|
0
|
|
|
|
|
return parse_int(options.at(name), name.c_str(), value, error); |
|
22124
|
|
|
|
|
|
|
return true; |
|
22125
|
|
|
|
|
|
|
} |
|
22126
|
|
|
|
|
|
|
|
|
22127
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::option_bool(const named_values::map& options, const string& name, bool& value, string& error, int model) { |
|
22128
|
|
|
|
|
|
|
string indexed_name(name); |
|
22129
|
0
|
0
|
|
|
|
|
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22130
|
|
|
|
|
|
|
|
|
22131
|
0
|
0
|
|
|
|
|
if (options.count(indexed_name) || options.count(name)) { |
|
22132
|
|
|
|
|
|
|
int int_value; |
|
22133
|
0
|
0
|
|
|
|
|
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22134
|
0
|
|
|
|
|
|
return false; |
|
22135
|
0
|
|
|
|
|
|
value = int_value != 0; |
|
22136
|
|
|
|
|
|
|
} |
|
22137
|
|
|
|
|
|
|
return true; |
|
22138
|
|
|
|
|
|
|
} |
|
22139
|
|
|
|
|
|
|
|
|
22140
|
0
|
|
|
|
|
|
bool trainer_morphodita_parsito::option_double(const named_values::map& options, const string& name, double& value, string& error, int model) { |
|
22141
|
|
|
|
|
|
|
string indexed_name(name); |
|
22142
|
0
|
0
|
|
|
|
|
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22143
|
|
|
|
|
|
|
|
|
22144
|
0
|
0
|
|
|
|
|
if (options.count(indexed_name)) |
|
22145
|
0
|
0
|
|
|
|
|
return parse_double(options.at(indexed_name), name.c_str(), value, error); |
|
22146
|
0
|
0
|
|
|
|
|
if (options.count(name)) |
|
22147
|
0
|
0
|
|
|
|
|
return parse_double(options.at(name), name.c_str(), value, error); |
|
22148
|
|
|
|
|
|
|
return true; |
|
22149
|
|
|
|
|
|
|
} |
|
22150
|
|
|
|
|
|
|
|
|
22151
|
|
|
|
|
|
|
// Various string data |
|
22152
|
|
|
|
|
|
|
|
|
22153
|
2
|
|
|
|
|
|
// Empty string whose reference option_str returns when an option is not set.
const string trainer_morphodita_parsito::empty_string = string();
|
22154
|
|
|
|
|
|
|
|
|
22155
|
2
|
|
|
|
|
|
// Candidate separator characters tried, in this order, by can_combine_tag
// and combine_tag when joining UPOSTAG, XPOSTAG and FEATS into one tag.
const string trainer_morphodita_parsito::tag_separators("~!@#$%^&*()/");
|
22156
|
|
|
|
|
|
|
|
|
22157
|
2
|
|
|
|
|
|
// Default feature templates selected by the "tagger" templates option.
//
// The value is a newline-terminated list of templates; every template is a
// comma-separated list of "<feature name> <integer>" items (the integer is
// presumably the word offset relative to the current word -- confirm against
// the feature template parser).
const string trainer_morphodita_parsito::tagger_features_tagger =
  // Tag combined with surrounding tags
  "Tag 0\n"
  "Tag 0,Tag -1\n"
  "Tag 0,TagUPos -1\n"
  "Tag 0,Tag -1,Tag -2\n"
  "Tag 0,TagUPos -1,TagUPos -2\n"
  "Tag 0,Tag -2\n"
  // Tag combined with surrounding forms
  "Tag 0,Form 0\n"
  "Tag 0,Form 0,Form -1\n"
  "Tag 0,Form -1\n"
  "Tag 0,Form -2\n"
  "Tag 0,Form -1,Form -2\n"
  "Tag 0,Form 1\n"
  "Tag 0,Form 1,Form 2\n"
  // Tag combined with the neighbouring verbs
  "Tag 0,PreviousVerbTag 0\n"
  "Tag 0,PreviousVerbForm 0\n"
  "Tag 0,FollowingVerbTag 0\n"
  "Tag 0,FollowingVerbForm 0\n"
  // Lemma-based templates
  "Tag 0,Lemma -1\n"
  // NOTE(review): "Tag 0,Form 1" is listed for the second time here; it is
  // kept as is, because removing it could change the trained models.
  "Tag 0,Form 1\n"
  "Lemma 0,Tag -1\n"
  // Tag combined with prefixes of the current word
  "Tag 0,Prefix1 0\n"
  "Tag 0,Prefix2 0\n"
  "Tag 0,Prefix3 0\n"
  "Tag 0,Prefix4 0\n"
  "Tag 0,Prefix5 0\n"
  "Tag 0,Prefix6 0\n"
  "Tag 0,Prefix7 0\n"
  "Tag 0,Prefix8 0\n"
  "Tag 0,Prefix9 0\n"
  // Tag combined with suffixes of the current word
  "Tag 0,Suffix1 0\n"
  "Tag 0,Suffix2 0\n"
  "Tag 0,Suffix3 0\n"
  "Tag 0,Suffix4 0\n"
  "Tag 0,Suffix5 0\n"
  "Tag 0,Suffix6 0\n"
  "Tag 0,Suffix7 0\n"
  "Tag 0,Suffix8 0\n"
  "Tag 0,Suffix9 0\n"
  // TagUPos, TagCase and TagGender sequences
  "TagUPos 0\n"
  "TagUPos 0,TagUPos -1\n"
  "TagUPos 0,TagUPos -1,TagUPos -2\n"
  "TagCase 0,TagCase -1\n"
  "TagCase 0,TagCase -1,TagCase -2\n"
  "TagGender 0,TagGender -1\n"
  "TagGender 0,TagGender -1,TagGender -2\n"
  // TagUPos combined with prefixes of the current word
  "TagUPos 0,Prefix1 0\n"
  "TagUPos 0,Prefix2 0\n"
  "TagUPos 0,Prefix3 0\n"
  "TagUPos 0,Prefix4 0\n"
  "TagUPos 0,Prefix5 0\n"
  "TagUPos 0,Prefix6 0\n"
  "TagUPos 0,Prefix7 0\n"
  "TagUPos 0,Prefix8 0\n"
  "TagUPos 0,Prefix9 0\n"
  // TagUPos combined with suffixes of the current word
  "TagUPos 0,Suffix1 0\n"
  "TagUPos 0,Suffix2 0\n"
  "TagUPos 0,Suffix3 0\n"
  "TagUPos 0,Suffix4 0\n"
  "TagUPos 0,Suffix5 0\n"
  "TagUPos 0,Suffix6 0\n"
  "TagUPos 0,Suffix7 0\n"
  "TagUPos 0,Suffix8 0\n"
  "TagUPos 0,Suffix9 0\n"
  // Tag combined with the Num, Cap and Dash features
  "Tag 0,Num 0\n"
  "Tag 0,Cap 0\n"
  "Tag 0,Dash 0\n"
  // TagNegative combined with short prefixes
  "TagNegative 0,Prefix1 0\n"
  "TagNegative 0,Prefix2 0\n"
  "TagNegative 0,Prefix3 0\n"
  // TagCase combined with short suffixes
  "TagCase 0,Suffix1 0\n"
  "TagCase 0,Suffix2 0\n"
  "TagCase 0,Suffix3 0\n"
  "TagCase 0,Suffix4 0\n"
  "TagCase 0,Suffix5 0\n";
|
22232
|
|
|
|
|
|
|
|
|
22233
|
2
|
|
|
|
|
|
// Feature-template text for the lemmatizer tagger, built by concatenating the
// adjacent string literals that follow. Each literal ends in '\n', so the
// resulting string holds one template per line; a template is a
// comma-separated list of "<Name> <integer>" items (e.g. "Tag 0,Form -1").
// NOTE(review): the integer is presumably a token offset relative to the
// current position -- confirm against the feature-template parser.
const string trainer_morphodita_parsito::tagger_features_lemmatizer = |
|
22234
|
|
|
|
|
|
|
"Tag 0\n" |
|
22235
|
|
|
|
|
|
|
"Tag 0,Tag -1\n" |
|
22236
|
|
|
|
|
|
|
"Tag 0,Tag -1,Tag -2\n" |
|
22237
|
|
|
|
|
|
|
"Tag 0,Tag -2\n" |
|
22238
|
|
|
|
|
|
|
"Tag 0,Form 0\n" |
|
22239
|
|
|
|
|
|
|
"Tag 0,Form 0,Form -1\n" |
|
22240
|
|
|
|
|
|
|
"Tag 0,Form -1\n" |
|
22241
|
|
|
|
|
|
|
"Tag 0,Form -2\n" |
|
22242
|
|
|
|
|
|
|
"Tag 0,PreviousVerbTag 0\n" |
|
22243
|
|
|
|
|
|
|
"Tag 0,PreviousVerbForm 0\n" |
|
22244
|
|
|
|
|
|
|
"Tag 0,FollowingVerbTag 0\n" |
|
22245
|
|
|
|
|
|
|
"Tag 0,FollowingVerbForm 0\n" |
|
22246
|
|
|
|
|
|
|
"Tag 0,Lemma -1\n" |
|
22247
|
|
|
|
|
|
|
"Tag 0,Form 1\n" |
|
22248
|
|
|
|
|
|
|
"Lemma 0\n" |
|
22249
|
|
|
|
|
|
|
"Lemma 0,Tag -1\n" |
|
22250
|
|
|
|
|
|
|
"Lemma 0,Tag -1,Tag -2\n" |
|
22251
|
|
|
|
|
|
|
"Lemma 0,Tag -2\n" |
|
22252
|
|
|
|
|
|
|
"Lemma 0,Form -1\n" |
|
22253
|
|
|
|
|
|
|
"Lemma 0,Form -1,Form -2\n" |
|
22254
|
|
|
|
|
|
|
"Lemma 0,Form -2\n" |
|
22255
|
|
|
|
|
|
|
"Lemma 0,PreviousVerbTag 0\n" |
|
22256
|
|
|
|
|
|
|
"Lemma 0,PreviousVerbForm 0\n" |
|
22257
|
|
|
|
|
|
|
"Lemma 0,FollowingVerbTag 0\n" |
|
22258
|
|
|
|
|
|
|
"Lemma 0,FollowingVerbForm 0\n" |
|
22259
|
|
|
|
|
|
|
"Lemma 0,Form 1\n" |
|
22260
|
|
|
|
|
|
|
"Tag 0,Prefix1 0\n" |
|
22261
|
|
|
|
|
|
|
"Tag 0,Prefix2 0\n" |
|
22262
|
|
|
|
|
|
|
"Tag 0,Prefix3 0\n" |
|
22263
|
|
|
|
|
|
|
"Tag 0,Prefix4 0\n" |
|
22264
|
|
|
|
|
|
|
"Tag 0,Prefix5 0\n" |
|
22265
|
|
|
|
|
|
|
"Tag 0,Suffix1 0\n" |
|
22266
|
|
|
|
|
|
|
"Tag 0,Suffix2 0\n" |
|
22267
|
|
|
|
|
|
|
"Tag 0,Suffix3 0\n" |
|
22268
|
|
|
|
|
|
|
"Tag 0,Suffix4 0\n" |
|
22269
|
|
|
|
|
|
|
"Tag 0,Suffix5 0\n" |
|
22270
|
|
|
|
|
|
|
"Tag 0,Num 0\n" |
|
22271
|
|
|
|
|
|
|
"Tag 0,Cap 0\n" |
|
22272
|
|
|
|
|
|
|
"Tag 0,Dash 0\n"; |
|
22273
|
|
|
|
|
|
|
|
|
22274
|
2
|
|
|
|
|
|
// Node-selector text for the parser, built by concatenating the adjacent
// string literals that follow. Each literal ends in '\n', so the resulting
// string holds one selector per line; a selector is a comma-separated chain
// of "stack N", "buffer N" and "child N" steps (e.g. "stack 0,child -1").
// NOTE(review): presumably each chain starts at a stack/buffer position and
// then descends through children, with negative child indices counting from
// the other side -- confirm against the parser's node-description reader.
const string trainer_morphodita_parsito::parser_nodes = |
|
22275
|
|
|
|
|
|
|
"stack 0\n" |
|
22276
|
|
|
|
|
|
|
"stack 1\n" |
|
22277
|
|
|
|
|
|
|
"stack 2\n" |
|
22278
|
|
|
|
|
|
|
"buffer 0\n" |
|
22279
|
|
|
|
|
|
|
"buffer 1\n" |
|
22280
|
|
|
|
|
|
|
"buffer 2\n" |
|
22281
|
|
|
|
|
|
|
"stack 0,child 0\n" |
|
22282
|
|
|
|
|
|
|
"stack 0,child 1\n" |
|
22283
|
|
|
|
|
|
|
"stack 0,child -2\n" |
|
22284
|
|
|
|
|
|
|
"stack 0,child -1\n" |
|
22285
|
|
|
|
|
|
|
"stack 1,child 0\n" |
|
22286
|
|
|
|
|
|
|
"stack 1,child 1\n" |
|
22287
|
|
|
|
|
|
|
"stack 1,child -2\n" |
|
22288
|
|
|
|
|
|
|
"stack 1,child -1\n" |
|
22289
|
|
|
|
|
|
|
"stack 0,child 0,child 0\n" |
|
22290
|
|
|
|
|
|
|
"stack 0,child -1,child -1\n" |
|
22291
|
|
|
|
|
|
|
"stack 1,child 0,child 0\n" |
|
22292
|
|
|
|
|
|
|
"stack 1,child -1,child -1\n"; |
|
22293
|
|
|
|
|
|
|
|
|
22294
|
|
|
|
|
|
|
///////// |
|
22295
|
|
|
|
|
|
|
// File: trainer/training_failure.cpp |
|
22296
|
|
|
|
|
|
|
///////// |
|
22297
|
|
|
|
|
|
|
|
|
22298
|
|
|
|
|
|
|
// This file is part of UDPipe <http://github.com/ufal/udpipe/>. |
|
22299
|
|
|
|
|
|
|
// |
|
22300
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
22301
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
22302
|
|
|
|
|
|
|
// |
|
22303
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
22304
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
22305
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
22306
|
|
|
|
|
|
|
|
|
22307
|
0
|
0
|
|
|
|
|
// Constructs the exception from the text currently held in the static
// message_collector stream: that text is passed to the runtime_error base as
// the exception message. The base-class initializer runs before the
// constructor body, so the message is captured first and only then is the
// collector emptied.
// NOTE(review): message_collector is a single shared stream; presumably the
// throw sites write the message into it immediately before constructing this
// exception, and concurrent use would need external synchronization --
// confirm at the callers.
training_error::training_error() : runtime_error(message_collector.str()) { |
|
22308
|
0
|
|
|
|
|
|
// Replace the collector's contents with an empty string so the next error
// starts from a clean buffer.
message_collector.str(string()); |
|
22309
|
0
|
|
|
|
|
|
} |
|
22310
|
|
|
|
|
|
|
|
|
22311
|
2
|
|
|
|
|
|
// Out-of-class definition of the static stream member of training_error.
// The training_error constructor reads this stream's contents as the
// exception message and then resets it to an empty string.
ostringstream training_error::message_collector; |
|
22312
|
|
|
|
|
|
|
|
|
22313
|
|
|
|
|
|
|
///////// |
|
22314
|
|
|
|
|
|
|
// File: unilib/unicode.cpp |
|
22315
|
|
|
|
|
|
|
///////// |
|
22316
|
|
|
|
|
|
|
|
|
22317
|
|
|
|
|
|
|
// This file is part of UniLib <http://github.com/ufal/unilib/>. |
|
22318
|
|
|
|
|
|
|
// |
|
22319
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
22320
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
22321
|
|
|
|
|
|
|
// |
|
22322
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
22323
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
22324
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
22325
|
|
|
|
|
|
|
// |
|
22326
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
22327
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
22328
|
|
|
|
|
|
|
|
|
22329
|
|
|
|
|
|
|
namespace unilib { |
|
22330
|
|
|
|
|
|
|
|
|
22331
|
|
|
|
|
|
|
// Out-of-class definition of the static constant unicode::CHARS. No
// initializer appears here, so the value is presumably supplied at the
// in-class declaration (not visible in this part of the file).
// CHARS >> 8 is used as the element count of unicode::category_index.
const char32_t unicode::CHARS; |
|
22332
|
|
|
|
|
|
|
|
|
22333
|
|
|
|
|
|
|
// Out-of-class definition of the static constant unicode::DEFAULT_CAT. No
// initializer appears here, so the value is presumably supplied at the
// in-class declaration (not visible in this part of the file).
const int32_t unicode::DEFAULT_CAT; |
|
22334
|
|
|
|
|
|
|
|
|
22335
|
|
|
|
|
|
|
// First-level lookup table with CHARS >> 8 one-byte entries, i.e. one entry
// per group of 256 consecutive values below CHARS.
// NOTE(review): each entry is presumably the row number in
// unicode::category_block (declared with 256 columns) that holds the
// categories for code points sharing the same (code point >> 8) prefix --
// confirm against the lookup code in the class declaration.
const uint8_t unicode::category_index[unicode::CHARS >> 8] = { |
|
22336
|
|
|
|
|
|
|
0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,17,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,33,41,42,43,44,45,46,47,48,39,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,49,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,50,17,17,17,51,17,52,53,54,55,56,57,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,58,59,59,59,59,59,59,59,59,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,17,61,62,17,63,64,65,66,67,68,69,70,71,17,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,17,17,17,97,98,99,100,100,100,100,100,100,100,100,100,101,17,17,17,17,102,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,17,17,103,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,17,17,104,105,100,100,106,107,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,108,17,17,17,17,109,110,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,111,17,112,113,100,100,100,100,100,100,100,100,100,114,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,115,116,117,118,119,120,121,122,123,39,39,124,100,100,100,100,125,126,127,128,100,129,100,100,130,131,132,100,100,133,134,135,100,136,137,138,139,39,39,140,141,142,39,143,144,100,100,100,100,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17, |
|
22337
|
|
|
|
|
|
|
17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,145,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,146,147,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,148,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,149,100,100,100,100,100,100,100,100,100,100,100,100,17,17,150,100,100,100,100,100,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,151,17,17,17,17,17,17,17,17,17,17,17,17,17,17,17,152,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100, 
|
|
22338
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
|
22339
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
|
22340
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
|
22341
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
|
22342
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,153,154,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,
 |
|
22343
|
|
|
|
|
|
|
100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,100,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,155,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60, |
|
22344
|
|
|
|
|
|
|
60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,155 |
|
22345
|
|
|
|
|
|
|
}; |
|
22346
|
|
|
|
|
|
|
|
|
22347
|
|
|
|
|
|
|
const uint8_t unicode::category_block[][256] = { |
|
22348
|
|
|
|
|
|
|
{_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Zs,_Po,_Po,_Po,_Sc,_Po,_Po,_Po,_Ps,_Pe,_Po,_Sm,_Po,_Pd,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Sm,_Sm,_Sm,_Po,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ps,_Po,_Pe,_Sk,_Pc,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ps,_Sm,_Pe,_Sm,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Cc,_Zs,_Po,_Sc,_Sc,_Sc,_Sc,_So,_Po,_Sk,_So,_Lo,_Pi,_Sm,_Cf,_So,_Sk,_So,_Sm,_No,_No,_Sk,_Ll,_Po,_Po,_Sk,_No,_Lo,_Pf,_No,_No,_No,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll}, |
|
22349
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lo,_Lu,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lu,_Lt,_Ll,_Lu,_Lt,_Ll,_Lu,_Lt,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Lt,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll}, |
|
22350
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Sk,_Lm,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk}, |
|
22351
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lu,_Ll,_Lu,_Ll,_Lm,_Sk,_Lu,_Ll,_Cn,_Cn,_Lm,_Ll,_Ll,_Ll,_Po,_Lu,_Cn,_Cn,_Cn,_Cn,_Sk,_Sk,_Lu,_Po,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Sm,_Lu,_Ll,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu}, |
|
22352
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Me,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll}, |
|
22353
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Po,_Pd,_Cn,_Cn,_So,_So,_Sc,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Pd,_Mn,_Po,_Mn,_Mn,_Po,_Mn,_Mn,_Po,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22354
|
|
|
|
|
|
|
{_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Sm,_Sm,_Sm,_Po,_Po,_Sc,_Po,_Po,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Cf,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cf,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Mn,_Mn,_So,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_So,_So,_Lo}, |
|
22355
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cf,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_So,_Po,_Po,_Po,_Lm,_Cn,_Cn,_Mn,_Sc,_Sc}, |
|
22356
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Cn,_Cn,_Po,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sk,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cf,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
|
22357
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Sc,_Sc,_No,_No,_No,_No,_No,_No,_So,_Sc,_Lo,_Po,_Mn,_Cn}, |
|
22358
|
|
|
|
|
|
|
{_Cn,_Mn,_Mn,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Mn,_Cn,_Mc,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mn,_Mn,_Lo,_Lo,_Lo,_Mn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mc,_Cn,_Mc,_Mc,_Mn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Sc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
|
22359
|
|
|
|
|
|
|
{_Cn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_So,_Lo,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Mc,_Mn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_So,_So,_So,_So,_So,_So,_Sc,_So,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22360
|
|
|
|
|
|
|
{_Mn,_Mc,_Mc,_Mc,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_No,_No,_No,_No,_No,_No,_No,_So,_Lo,_Mn,_Mc,_Mc,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Lo,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mc,_Mc,_Cn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Lo,_Lo,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22361
|
|
|
|
|
|
|
{_Mn,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Mc,_Mn,_Lo,_So,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Mc,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Cn,_Mn,_Cn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Mc,_Mc,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22362
|
|
|
|
|
|
|
{_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Sc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lm,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22363
|
|
|
|
|
|
|
{_Lo,_So,_So,_So,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_Po,_So,_So,_So,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_Mn,_So,_Mn,_So,_Mn,_Ps,_Pe,_Ps,_Pe,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22364
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Lo,_Mc,_Mc,_Mc,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Lo,_Mc,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mc,_Mc,_Mc,_Mn,_So,_So,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Po,_Lm,_Ll,_Ll,_Ll}, |
|
22365
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22366
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22367
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn}, |
|
22368
|
|
|
|
|
|
|
{_Pd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22369
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Zs,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Ps,_Pe,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Nl,_Nl,_Nl,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22370
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Sc,_Lo,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22371
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Po,_Po,_Po,_Po,_Mn,_Mn,_Mn,_Cf,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22372
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22373
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mc,_Mn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22374
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Po,_Po,_Cn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po}, |
|
22375
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Po,_Po,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Po,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Mc,_Mn,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22376
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
|
22377
|
|
|
|
|
|
|
{_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll}, |
|
22378
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Lu,_Cn,_Lu,_Cn,_Lu,_Cn,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Lt,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Ll,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Cn,_Sk,_Sk,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Sk,_Sk,_Sk,_Cn,_Cn,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lt,_Sk,_Sk,_Cn}, |
|
22379
|
|
|
|
|
|
|
{_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Zs,_Cf,_Cf,_Cf,_Cf,_Cf,_Pd,_Pd,_Pd,_Pd,_Pd,_Pd,_Po,_Po,_Pi,_Pf,_Ps,_Pi,_Pi,_Pf,_Ps,_Pi,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Zl,_Zp,_Cf,_Cf,_Cf,_Cf,_Cf,_Zs,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pi,_Pf,_Po,_Po,_Po,_Po,_Pc,_Pc,_Po,_Po,_Po,_Sm,_Ps,_Pe,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Sm,_Po,_Pc,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Zs,_Cf,_Cf,_Cf,_Cf,_Cf,_Cn,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_No,_Lm,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_Sm,_Sm,_Sm,_Ps,_Pe,_Lm,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Sm,_Sm,_Sm,_Ps,_Pe,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Sc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Me,_Me,_Me,_Me,_Mn,_Me,_Me,_Me,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22380
|
|
|
|
|
|
|
{_So,_So,_Lu,_So,_So,_So,_So,_Lu,_So,_So,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Lu,_Lu,_Ll,_So,_Lu,_So,_So,_Sm,_Lu,_Lu,_Lu,_Lu,_Lu,_So,_So,_So,_So,_So,_So,_Lu,_So,_Lu,_So,_Lu,_So,_Lu,_Lu,_Lu,_Lu,_So,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lo,_Lo,_Lo,_Lo,_Ll,_So,_So,_Ll,_Ll,_Lu,_Lu,_Sm,_Sm,_Sm,_Sm,_Sm,_Lu,_Ll,_Ll,_Ll,_Ll,_So,_Sm,_So,_So,_Ll,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Lu,_Ll,_Nl,_Nl,_Nl,_Nl,_No,_So,_So,_Cn,_Cn,_Cn,_Cn,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_So,_So,_Sm,_So,_So,_Sm,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_Sm,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
|
22381
|
|
|
|
|
|
|
{_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
|
22382
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_Ps,_Pe,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22383
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No}, |
|
22384
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
|
22385
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22386
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm}, |
|
22387
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22388
|
|
|
|
|
|
|
{_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Ps,_Pe,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Ps,_Pe,_Sm,_Sm}, |
|
22389
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_Sm,_Sm,_Sm,_Sm,_Sm,_Sm,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22390
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Ll,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Lm,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_So,_So,_So,_So,_So,_So,_Lu,_Ll,_Lu,_Ll,_Mn,_Mn,_Mn,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_No,_Po,_Po}, |
|
22391
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn}, |
|
22392
|
|
|
|
|
|
|
{_Po,_Po,_Pi,_Pf,_Pi,_Pf,_Po,_Po,_Po,_Pi,_Pf,_Po,_Pi,_Pf,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Po,_Po,_Pd,_Po,_Pi,_Pf,_Po,_Po,_Pi,_Pf,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Po,_Po,_Po,_Lm,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Pd,_Pd,_Po,_Po,_Po,_Po,_Pd,_Po,_Ps,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_So,_So,_Po,_Po,_Po,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Pd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22393
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn}, |
|
22394
|
|
|
|
|
|
|
{_Zs,_Po,_Po,_Po,_So,_Lm,_Lo,_Nl,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_So,_So,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Pd,_Ps,_Pe,_Pe,_So,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Pd,_Lm,_Lm,_Lm,_Lm,_Lm,_So,_So,_Nl,_Nl,_Nl,_Lm,_Lo,_Po,_So,_So,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Sk,_Sk,_Lm,_Lm,_Lo,_Pd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Lm,_Lm,_Lm,_Lo}, |
|
22395
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_So,_So,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22396
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22397
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22398
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22399
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Po,_Po}, |
|
22400
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lo,_Mn,_Me,_Me,_Me,_Po,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Lm,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Lm,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22401
|
|
|
|
|
|
|
{_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Sk,_Sk,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lm,_Sk,_Sk,_Lu,_Ll,_Lu,_Ll,_Lo,_Lu,_Ll,_Lu,_Ll,_Ll,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Ll,_Lu,_Lu,_Lu,_Lu,_Ll,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Ll,_Cn,_Ll,_Cn,_Ll,_Lu,_Ll,_Lu,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lu,_Ll,_Lo,_Lm,_Lm,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22402
|
|
|
|
|
|
|
{_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mc,_So,_So,_So,_So,_Mn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_So,_So,_Sc,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Lo,_Po,_Lo,_Lo,_Mn}, |
|
22403
|
|
|
|
|
|
|
{_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mc,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Lm,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn}, |
|
22404
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_So,_Lo,_Mc,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Lo,_Mn,_Mn,_Mn,_Lo,_Lo,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Lo,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lm,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mc,_Mc,_Po,_Po,_Lo,_Lm,_Lm,_Mc,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22405
|
|
|
|
|
|
|
{_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sk,_Lm,_Lm,_Lm,_Lm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lm,_Sk,_Sk,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mc,_Mc,_Mn,_Mc,_Mc,_Po,_Mc,_Mn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22406
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn}, |
|
22407
|
|
|
|
|
|
|
{_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs,_Cs}, |
|
22408
|
|
|
|
|
|
|
{_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co}, |
|
22409
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22410
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Sk,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22411
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Pe,_Ps,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Sc,_So,_So,_So}, |
|
22412
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Ps,_Pe,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Pd,_Pd,_Pc,_Pc,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Ps,_Pe,_Po,_Po,_Po,_Po,_Pc,_Pc,_Pc,_Po,_Po,_Po,_Cn,_Po,_Po,_Po,_Po,_Pd,_Ps,_Pe,_Ps,_Pe,_Ps,_Pe,_Po,_Po,_Po,_Sm,_Pd,_Sm,_Sm,_Sm,_Cn,_Po,_Sc,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cf}, |
|
22413
|
|
|
|
|
|
|
{_Cn,_Po,_Po,_Po,_Sc,_Po,_Po,_Po,_Ps,_Pe,_Po,_Sm,_Po,_Pd,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Sm,_Sm,_Sm,_Po,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ps,_Po,_Pe,_Sk,_Pc,_Sk,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ps,_Sm,_Pe,_Sm,_Ps,_Pe,_Po,_Ps,_Pe,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Lm,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Sc,_Sc,_Sm,_Sk,_So,_Sc,_Sc,_Cn,_So,_Sm,_Sm,_Sm,_Sm,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cf,_Cf,_So,_So,_Cn,_Cn}, |
|
22414
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22415
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_No,_No,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Cn,_Cn}, |
|
22416
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn}, |
|
22417
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Nl,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Nl,_Nl,_Nl,_Nl,_Nl,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22418
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn}, |
|
22419
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22420
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22421
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Po,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_So,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No}, |
|
22422
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_No,_No,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No}, |
|
22423
|
|
|
|
|
|
|
{_Lo,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Mn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22424
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22425
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No}, |
|
22426
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22427
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Pd,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn}, |
|
22428
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_No,_No,_No,_No,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22429
|
|
|
|
|
|
|
{_Mc,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Mn,_Lo,_Lo,_Mn,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Po,_Po,_Cf,_Po,_Po,_Po,_Po,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22430
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Po,_Po,_Lo,_Mc,_Mc,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Po,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Mn,_Mn,_Mn,_Mn,_Po,_Mc,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Lo,_Po,_Lo,_Po,_Po,_Po,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22431
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Mn,_Lo,_Lo,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22432
|
|
|
|
|
|
|
{_Mn,_Mn,_Mc,_Mc,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Mn,_Mn,_Lo,_Mc,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Mc,_Mc,_Mc,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22433
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mc,_Mn,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Po,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Po,_Po,_Cn,_Po,_Mn,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Lo,_Lo,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22434
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mc,_Mc,_Mc,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22435
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mc,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Lo,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22436
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_Po,_Po,_Po,_So,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22437
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo}, |
|
22438
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Mc,_Mc,_Cn,_Cn,_Mn,_Mn,_Mc,_Mn,_Lo,_Mc,_Lo,_Mc,_Mn,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mc,_Mc,_Mc,_Mc,_Mn,_Lo,_Po,_Lo,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22439
|
|
|
|
|
|
|
{_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Lo,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mc,_Mn,_Mn,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Po,_Po,_Po,_Lo,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22440
|
|
|
|
|
|
|
{_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22441
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Lo,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mc,_Mn,_Mn,_Mc,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22442
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mn,_Cn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lo,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Mn,_Mn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22443
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mc,_Mc,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22444
|
|
|
|
|
|
|
{_Mn,_Mn,_Lo,_Mc,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mc,_Mc,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Mc,_Mc,_Mn,_Mc,_Mn,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Po,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_Sc,_Sc,_Sc,_Sc,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Po}, |
|
22445
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22446
|
|
|
|
|
|
|
{_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Nl,_Cn,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22447
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22448
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22449
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22450
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Mn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22451
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22452
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22453
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Po,_Po,_Po,_Po,_Po,_So,_So,_So,_So,_Lm,_Lm,_Lm,_Lm,_Po,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_No,_No,_No,_No,_No,_No,_No,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22454
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22455
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Mn,_Lo,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Po,_Lm,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mc,_Mc,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22456
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22457
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22458
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22459
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Lm,_Lm,_Cn}, |
|
22460
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22461
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn}, |
|
22462
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_So,_Mn,_Mn,_Po,_Cf,_Cf,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22463
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22464
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22465
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mc,_Mc,_Mn,_Mn,_Mn,_So,_So,_So,_Mc,_Mc,_Mc,_Mc,_Mc,_Mc,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22466
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_Mn,_Mn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22467
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22468
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Cn,_Lu,_Lu,_Cn,_Cn,_Lu,_Cn,_Cn,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll}, |
|
22469
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Lu,_Cn,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll}, |
|
22470
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll}, |
|
22471
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Sm,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lu,_Ll,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd}, |
|
22472
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Mn,_So,_So,_Po,_Po,_Po,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22473
|
|
|
|
|
|
|
{_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Lo,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22474
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Mn,_Mn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22475
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Lm,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Lo,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22476
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Mn,_Mn,_Mn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Sc}, |
|
22477
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lm,_Mn,_Mn,_Mn,_Mn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22478
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn}, |
|
22479
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22480
|
|
|
|
|
|
|
{_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Lu,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Ll,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Lm,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Po,_Po,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22481
|
|
|
|
|
|
|
{_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_No,_No,_No,_Sc,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22482
|
|
|
|
|
|
|
{_Cn,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22483
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Cn,_Cn,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Cn,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Sm,_Sm,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22484
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22485
|
|
|
|
|
|
|
{_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_No,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So}, |
|
22486
|
|
|
|
|
|
|
{_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22487
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Sk,_Sk,_Sk,_Sk,_Sk}, |
|
22488
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn}, |
|
22489
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22490
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22491
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22492
|
|
|
|
|
|
|
{_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_So,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Nd,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22493
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22494
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22495
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22496
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22497
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22498
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22499
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo}, |
|
22500
|
|
|
|
|
|
|
{_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Lo,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22501
|
|
|
|
|
|
|
{_Cn,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cf,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22502
|
|
|
|
|
|
|
{_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Mn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn,_Cn}, |
|
22503
|
|
|
|
|
|
|
{_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Co,_Cn,_Cn} |
|
22504
|
|
|
|
|
|
|
}; |
|
22505
|
|
|
|
|
|
|
|
|
22506
|
|
|
|
|
|
|
const uint8_t unicode::othercase_index[unicode::CHARS >> 8] = { |
|
22507
|
|
|
|
|
|
|
0,1,2,3,4,5,6,6,6,6,6,6,6,6,6,6,7,6,6,8,6,6,6,6,6,6,6,6,9,10,11,12,6,13,6,6,14,6,6,6,6,6,6,6,15,16,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,17,18,6,6,6,19,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,20,6,6,6,6,21,22,6,6,6,6,6,6,23,6,6,6,6,6,6,6,6,6,6,6,24,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,25,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,26,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, |
|
22508
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 |
|
22509
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 |
|
22510
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
 |
|
22511
|
|
|
|
|
|
|
6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6 |
|
22512
|
|
|
|
|
|
|
}; |
|
22513
|
|
|
|
|
|
|
|
|
22514
|
|
|
|
|
|
|
const char32_t unicode::othercase_block[][256] = { |
|
22515
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24833,25089,25345,25601,25857,26113,26369,26625,26881,27137,27393,27649,27905,28161,28417,28673,28929,29185,29441,29697,29953,30209,30465,30721,30977,31233,0,0,0,0,0,0,16642,16898,17154,17410,17666,17922,18178,18434,18690,18946,19202,19458,19714,19970,20226,20482,20738,20994,21250,21506,21762,22018,22274,22530,22786,23042,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,236546,0,0,0,0,0,0,0,0,0,0,57345,57601,57857,58113,58369,58625,58881,59137,59393,59649,59905,60161,60417,60673,60929,61185,61441,61697,61953,62209,62465,62721,62977,0,63489,63745,64001,64257,64513,64769,65025,0,49154,49410,49666,49922,50178,50434,50690,50946,51202,51458,51714,51970,52226,52482,52738,52994,53250,53506,53762,54018,54274,54530,54786,0,55298,55554,55810,56066,56322,56578,56834,96258}, |
|
22516
|
|
|
|
|
|
|
{65793,65538,66305,66050,66817,66562,67329,67074,67841,67586,68353,68098,68865,68610,69377,69122,69889,69634,70401,70146,70913,70658,71425,71170,71937,71682,72449,72194,72961,72706,73473,73218,73985,73730,74497,74242,75009,74754,75521,75266,76033,75778,76545,76290,77057,76802,77569,77314,26881,18690,78593,78338,79105,78850,79617,79362,0,80385,80130,80897,80642,81409,81154,81921,81666,82433,82178,82945,82690,83457,83202,83969,83714,0,84737,84482,85249,84994,85761,85506,86273,86018,86785,86530,87297,87042,87809,87554,88321,88066,88833,88578,89345,89090,89857,89602,90369,90114,90881,90626,91393,91138,91905,91650,92417,92162,92929,92674,93441,93186,93953,93698,94465,94210,94977,94722,95489,95234,96001,95746,65281,96769,96514,97281,97026,97793,97538,21250,148226,152321,99073,98818,99585,99330,152577,100353,100098,153089,153345,101377,101122,0,122113,153857,154369,102913,102658,155649,156417,128514,157953,157697,104705,104450,146690,0,159489,160257,139266,161025,106753,106498,107265,107010,107777,107522,163841,108545,108290,164609,0,0,109825,109570,165889,110593,110338,166401,166657,111617,111362,112129,111874,168449,112897,112642,0,0,113921,113666,0,128770,0,0,0,0,115974,116228,115717,116742,116996,116485,117510,117764,117253,118273,118018,118785,118530,119297,119042,119809,119554,120321,120066,120833,120578,121345,121090,121857,121602,101890,122625,122370,123137,122882,123649,123394,124161,123906,124673,124418,125185,124930,125697,125442,126209,125954,126721,126466,0,127494,127748,127237,128257,128002,103681,114433,129281,129026,129793,129538,130305,130050,130817,130562}, |
|
22517
|
|
|
|
|
|
|
{131329,131074,131841,131586,132353,132098,132865,132610,133377,133122,133889,133634,134401,134146,134913,134658,135425,135170,135937,135682,136449,136194,136961,136706,137473,137218,137985,137730,138497,138242,139009,138754,105985,0,140033,139778,140545,140290,141057,140802,141569,141314,142081,141826,142593,142338,143105,142850,143617,143362,144129,143874,0,0,0,0,0,0,2909441,146433,146178,104961,2909697,2915842,2916098,147969,147714,98305,166145,166913,149249,148994,149761,149506,150273,150018,150785,150530,151297,151042,2912002,2911490,2912258,98562,99842,0,100610,100866,0,102146,0,102402,10988290,0,0,0,103170,10988546,0,103426,0,10980610,10988034,0,104194,103938,10989058,2908674,10988802,0,0,105474,0,2911746,105730,0,0,106242,0,0,0,0,0,0,0,2909186,0,0,108034,0,10994946,108802,0,0,0,10989826,110082,148482,110850,111106,148738,0,0,0,0,0,112386,0,0,0,0,0,0,0,0,0,0,10990082,10989570,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22518
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235778,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,225537,225282,226049,225794,0,0,227073,226818,0,0,0,261378,261634,261890,0,258817,0,0,0,0,0,0,240641,0,240897,241153,241409,0,248833,0,249089,249345,0,241921,242177,242433,242689,242945,243201,243457,243713,243969,244225,244481,244737,244993,245249,245505,245761,246017,0,246529,246785,247041,247297,247553,247809,248065,248321,248577,230914,231426,231682,231938,0,233730,233986,234242,234498,234754,235010,235266,235522,235778,236034,236290,236546,236802,237058,237314,237570,237826,238338,238338,238594,238850,239106,239362,239618,239874,240130,240386,232450,232962,233218,251649,233986,235522,0,0,0,239106,237570,249602,252161,251906,252673,252418,253185,252930,253697,253442,254209,253954,254721,254466,255233,254978,255745,255490,256257,256002,256769,256514,257281,257026,257793,257538,236034,237826,260354,229122,243713,234754,0,260097,259842,258561,260865,260610,0,228097,228353,228609}, |
|
22519
|
|
|
|
|
|
|
{282625,282881,283137,283393,283649,283905,284161,284417,284673,284929,285185,285441,285697,285953,286209,286465,274433,274689,274945,275201,275457,275713,275969,276225,276481,276737,276993,277249,277505,277761,278017,278273,278529,278785,279041,279297,279553,279809,280065,280321,280577,280833,281089,281345,281601,281857,282113,282369,266242,266498,266754,267010,267266,267522,267778,268034,268290,268546,268802,269058,269314,269570,269826,270082,270338,270594,270850,271106,271362,271618,271874,272130,272386,272642,272898,273154,273410,273666,273922,274178,262146,262402,262658,262914,263170,263426,263682,263938,264194,264450,264706,264962,265218,265474,265730,265986,286977,286722,287489,287234,288001,287746,288513,288258,289025,288770,289537,289282,290049,289794,290561,290306,291073,290818,291585,291330,292097,291842,292609,292354,293121,292866,293633,293378,294145,293890,294657,294402,295169,294914,0,0,0,0,0,0,0,0,297729,297474,298241,297986,298753,298498,299265,299010,299777,299522,300289,300034,300801,300546,301313,301058,301825,301570,302337,302082,302849,302594,303361,303106,303873,303618,304385,304130,304897,304642,305409,305154,305921,305666,306433,306178,306945,306690,307457,307202,307969,307714,308481,308226,308993,308738,309505,309250,310017,309762,310529,310274,311041,310786,315137,311809,311554,312321,312066,312833,312578,313345,313090,313857,313602,314369,314114,314881,314626,311298,315649,315394,316161,315906,316673,316418,317185,316930,317697,317442,318209,317954,318721,318466,319233,318978,319745,319490,320257,320002,320769,320514,321281,321026,321793,321538,322305,322050,322817,322562,323329,323074,323841,323586,324353,324098,324865,324610,325377,325122,325889,325634,326401,326146,326913,326658,327425,327170}, |
|
22520
|
|
|
|
|
|
|
{327937,327682,328449,328194,328961,328706,329473,329218,329985,329730,330497,330242,331009,330754,331521,331266,332033,331778,332545,332290,333057,332802,333569,333314,334081,333826,334593,334338,335105,334850,335617,335362,336129,335874,336641,336386,337153,336898,337665,337410,338177,337922,338689,338434,339201,338946,339713,339458,0,352513,352769,353025,353281,353537,353793,354049,354305,354561,354817,355073,355329,355585,355841,356097,356353,356609,356865,357121,357377,357633,357889,358145,358401,358657,358913,359169,359425,359681,359937,360193,360449,360705,360961,361217,361473,361729,361985,0,0,0,0,0,0,0,0,0,0,340226,340482,340738,340994,341250,341506,341762,342018,342274,342530,342786,343042,343298,343554,343810,344066,344322,344578,344834,345090,345346,345602,345858,346114,346370,346626,346882,347138,347394,347650,347906,348162,348418,348674,348930,349186,349442,349698,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22521
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22522
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2949121,2949377,2949633,2949889,2950145,2950401,2950657,2950913,2951169,2951425,2951681,2951937,2952193,2952449,2952705,2952961,2953217,2953473,2953729,2953985,2954241,2954497,2954753,2955009,2955265,2955521,2955777,2956033,2956289,2956545,2956801,2957057,2957313,2957569,2957825,2958081,2958337,2958593,0,2959105,0,0,0,0,0,2960641,0,0,1871875,1872131,1872387,1872643,1872899,1873155,1873411,1873667,1873923,1874179,1874435,1874691,1874947,1875203,1875459,1875715,1875971,1876227,1876483,1876739,1876995,1877251,1877507,1877763,1878019,1878275,1878531,1878787,1879043,1879299,1879555,1879811,1880067,1880323,1880579,1880835,1881091,1881347,1881603,1881859,1882115,1882371,1882627,0,0,1883395,1883651,1883907}, |
|
22523
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11235329,11235585,11235841,11236097,11236353,11236609,11236865,11237121,11237377,11237633,11237889,11238145,11238401,11238657,11238913,11239169,11239425,11239681,11239937,11240193,11240449,11240705,11240961,11241217,11241473,11241729,11241985,11242241,11242497,11242753,11243009,11243265,11243521,11243777,11244033,11244289,11244545,11244801,11245057,11245313,11245569,11245825,11246081,11246337,11246593,11246849,11247105,11247361,11247617,11247873,11248129,11248385,11248641,11248897,11249153,11249409,11249665,11249921,11250177,11250433,11250689,11250945,11251201,11251457,11251713,11251969,11252225,11252481,11252737,11252993,11253249,11253505,11253761,11254017,11254273,11254529,11254785,11255041,11255297,11255553,1308673,1308929,1309185,1309441,1309697,1309953,0,0,1306626,1306882,1307138,1307394,1307650,1307906,0,0}, |
|
22524
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,266754,267266,269826,270594,270850,270850,272898,287234,10897922,0,0,0,0,0,0,0,1101825,1102081,1102337,1102593,1102849,1103105,1103361,1103617,1103873,1104129,1104385,1104641,1104897,1105153,1105409,1105665,1105921,1106177,1106433,1106689,1106945,1107201,1107457,1107713,1107969,1108225,1108481,1108737,1108993,1109249,1109505,1109761,1110017,1110273,1110529,1110785,1111041,1111297,1111553,1111809,1112065,1112321,1112577,0,0,1113345,1113601,1113857,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22525
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10976514,0,0,0,2908930,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10995202,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22526
|
|
|
|
|
|
|
{1966337,1966082,1966849,1966594,1967361,1967106,1967873,1967618,1968385,1968130,1968897,1968642,1969409,1969154,1969921,1969666,1970433,1970178,1970945,1970690,1971457,1971202,1971969,1971714,1972481,1972226,1972993,1972738,1973505,1973250,1974017,1973762,1974529,1974274,1975041,1974786,1975553,1975298,1976065,1975810,1976577,1976322,1977089,1976834,1977601,1977346,1978113,1977858,1978625,1978370,1979137,1978882,1979649,1979394,1980161,1979906,1980673,1980418,1981185,1980930,1981697,1981442,1982209,1981954,1982721,1982466,1983233,1982978,1983745,1983490,1984257,1984002,1984769,1984514,1985281,1985026,1985793,1985538,1986305,1986050,1986817,1986562,1987329,1987074,1987841,1987586,1988353,1988098,1988865,1988610,1989377,1989122,1989889,1989634,1990401,1990146,1990913,1990658,1991425,1991170,1991937,1991682,1992449,1992194,1992961,1992706,1993473,1993218,1993985,1993730,1994497,1994242,1995009,1994754,1995521,1995266,1996033,1995778,1996545,1996290,1997057,1996802,1997569,1997314,1998081,1997826,1998593,1998338,1999105,1998850,1999617,1999362,2000129,1999874,2000641,2000386,2001153,2000898,2001665,2001410,2002177,2001922,2002689,2002434,2003201,2002946,2003713,2003458,2004225,2003970,0,0,0,0,0,1990658,0,0,57089,0,2007297,2007042,2007809,2007554,2008321,2008066,2008833,2008578,2009345,2009090,2009857,2009602,2010369,2010114,2010881,2010626,2011393,2011138,2011905,2011650,2012417,2012162,2012929,2012674,2013441,2013186,2013953,2013698,2014465,2014210,2014977,2014722,2015489,2015234,2016001,2015746,2016513,2016258,2017025,2016770,2017537,2017282,2018049,2017794,2018561,2018306,2019073,2018818,2019585,2019330,2020097,2019842,2020609,2020354,2021121,2020866,2021633,2021378,2022145,2021890,2022657,2022402,2023169,2022914,2023681,2023426,2024193,2023938,2024705,2024450,2025217,2024962,2025729,2025474,2026241,2025986,2026753,2026498,2027265,2027010,2027777,2027522,2028289,2028034,2028801,2028546,2029313,2029058,2029825,2029570,2030337,2030082,2030849,2030594,2031361, |
|
22527
|
|
|
|
|
|
|
2031106}, |
|
22528
|
|
|
|
|
|
|
{2033666,2033922,2034178,2034434,2034690,2034946,2035202,2035458,2031617,2031873,2032129,2032385,2032641,2032897,2033153,2033409,2037762,2038018,2038274,2038530,2038786,2039042,0,0,2035713,2035969,2036225,2036481,2036737,2036993,0,0,2041858,2042114,2042370,2042626,2042882,2043138,2043394,2043650,2039809,2040065,2040321,2040577,2040833,2041089,2041345,2041601,2045954,2046210,2046466,2046722,2046978,2047234,2047490,2047746,2043905,2044161,2044417,2044673,2044929,2045185,2045441,2045697,2050050,2050306,2050562,2050818,2051074,2051330,0,0,2048001,2048257,2048513,2048769,2049025,2049281,0,0,0,2054402,0,2054914,0,2055426,0,2055938,0,2052353,0,2052865,0,2053377,0,2053889,2058242,2058498,2058754,2059010,2059266,2059522,2059778,2060034,2056193,2056449,2056705,2056961,2057217,2057473,2057729,2057985,2079234,2079490,2082818,2083074,2083330,2083586,2087426,2087682,2095106,2095362,2091522,2091778,2095618,2095874,0,0,2066434,2066690,2066946,2067202,2067458,2067714,2067970,2068226,2064385,2064641,2064897,2065153,2065409,2065665,2065921,2066177,2070530,2070786,2071042,2071298,2071554,2071810,2072066,2072322,2068481,2068737,2068993,2069249,2069505,2069761,2070017,2070273,2074626,2074882,2075138,2075394,2075650,2075906,2076162,2076418,2072577,2072833,2073089,2073345,2073601,2073857,2074113,2074369,2078722,2078978,0,2079746,0,0,0,0,2076673,2076929,2060289,2060545,2077441,0,235778,0,0,0,0,2083842,0,0,0,0,2060801,2061057,2061313,2061569,2081537,0,0,0,2086914,2087170,0,0,0,0,0,0,2084865,2085121,2061825,2062081,0,0,0,0,2091010,2091266,0,0,0,2092034,0,0,2088961,2089217,2062849,2063105,2090241,0,0,0,0,0,0,2096130,0,0,0,0,2062337,2062593,2063361,2063617,2093825,0,0,0}, |
|
22529
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,248065,0,0,0,27393,58625,0,0,0,0,0,0,2182657,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2175490,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2191361,2191617,2191873,2192129,2192385,2192641,2192897,2193153,2193409,2193665,2193921,2194177,2194433,2194689,2194945,2195201,2187266,2187522,2187778,2188034,2188290,2188546,2188802,2189058,2189314,2189570,2189826,2190082,2190338,2190594,2190850,2191106,0,0,0,2196481,2196226,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22530
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2412545,2412801,2413057,2413313,2413569,2413825,2414081,2414337,2414593,2414849,2415105,2415361,2415617,2415873,2416129,2416385,2416641,2416897,2417153,2417409,2417665,2417921,2418177,2418433,2418689,2418945,2405890,2406146,2406402,2406658,2406914,2407170,2407426,2407682,2407938,2408194,2408450,2408706,2408962,2409218,2409474,2409730,2409986,2410242,2410498,2410754,2411010,2411266,2411522,2411778,2412034,2412290,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22531
|
|
|
|
|
|
|
{2895873,2896129,2896385,2896641,2896897,2897153,2897409,2897665,2897921,2898177,2898433,2898689,2898945,2899201,2899457,2899713,2899969,2900225,2900481,2900737,2900993,2901249,2901505,2901761,2902017,2902273,2902529,2902785,2903041,2903297,2903553,2903809,2904065,2904321,2904577,2904833,2905089,2905345,2905601,2905857,2906113,2906369,2906625,2906881,2907137,2907393,2907649,2907905,2883586,2883842,2884098,2884354,2884610,2884866,2885122,2885378,2885634,2885890,2886146,2886402,2886658,2886914,2887170,2887426,2887682,2887938,2888194,2888450,2888706,2888962,2889218,2889474,2889730,2889986,2890242,2890498,2890754,2891010,2891266,2891522,2891778,2892034,2892290,2892546,2892802,2893058,2893314,2893570,2893826,2894082,2894338,2894594,2894850,2895106,2895362,2895618,2908417,2908162,158465,1932545,163073,145922,146946,2910209,2909954,2910721,2910466,2911233,2910978,151809,160001,151553,152065,0,2913025,2912770,0,2913793,2913538,0,0,0,0,0,0,0,147201,147457,2916609,2916354,2917121,2916866,2917633,2917378,2918145,2917890,2918657,2918402,2919169,2918914,2919681,2919426,2920193,2919938,2920705,2920450,2921217,2920962,2921729,2921474,2922241,2921986,2922753,2922498,2923265,2923010,2923777,2923522,2924289,2924034,2924801,2924546,2925313,2925058,2925825,2925570,2926337,2926082,2926849,2926594,2927361,2927106,2927873,2927618,2928385,2928130,2928897,2928642,2929409,2929154,2929921,2929666,2930433,2930178,2930945,2930690,2931457,2931202,2931969,2931714,2932481,2932226,2932993,2932738,2933505,2933250,2934017,2933762,2934529,2934274,2935041,2934786,2935553,2935298,2936065,2935810,2936577,2936322,2937089,2936834,2937601,2937346,2938113,2937858,2938625,2938370,2939137,2938882,2939649,2939394,2940161,2939906,2940673,2940418,2941185,2940930,2941697,2941442,0,0,0,0,0,0,0,2944001,2943746,2944513,2944258,0,0,0,2945793,2945538,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22532
|
|
|
|
|
|
|
{1089538,1089794,1090050,1090306,1090562,1090818,1091074,1091330,1091586,1091842,1092098,1092354,1092610,1092866,1093122,1093378,1093634,1093890,1094146,1094402,1094658,1094914,1095170,1095426,1095682,1095938,1096194,1096450,1096706,1096962,1097218,1097474,1097730,1097986,1098242,1098498,1098754,1099010,0,1099522,0,0,0,0,0,1101058,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22533
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10895617,10895362,10896129,10895874,10896641,10896386,10897153,10896898,10897665,10897410,10898177,10897922,10898689,10898434,10899201,10898946,10899713,10899458,10900225,10899970,10900737,10900482,10901249,10900994,10901761,10901506,10902273,10902018,10902785,10902530,10903297,10903042,10903809,10903554,10904321,10904066,10904833,10904578,10905345,10905090,10905857,10905602,10906369,10906114,10906881,10906626,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10912001,10911746,10912513,10912258,10913025,10912770,10913537,10913282,10914049,10913794,10914561,10914306,10915073,10914818,10915585,10915330,10916097,10915842,10916609,10916354,10917121,10916866,10917633,10917378,10918145,10917890,10918657,10918402,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22534
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10953473,10953218,10953985,10953730,10954497,10954242,10955009,10954754,10955521,10955266,10956033,10955778,10956545,10956290,0,0,10957569,10957314,10958081,10957826,10958593,10958338,10959105,10958850,10959617,10959362,10960129,10959874,10960641,10960386,10961153,10960898,10961665,10961410,10962177,10961922,10962689,10962434,10963201,10962946,10963713,10963458,10964225,10963970,10964737,10964482,10965249,10964994,10965761,10965506,10966273,10966018,10966785,10966530,10967297,10967042,10967809,10967554,10968321,10968066,10968833,10968578,10969345,10969090,10969857,10969602,10970369,10970114,10970881,10970626,10971393,10971138,10971905,10971650,10972417,10972162,10972929,10972674,0,0,0,0,0,0,0,0,0,10975745,10975490,10976257,10976002,1931521,10977025,10976770,10977537,10977282,10978049,10977794,10978561,10978306,10979073,10978818,0,0,0,10980353,10980098,156929,0,0,10981633,10981378,10982145,10981890,10994690,0,10983169,10982914,10983681,10983426,10984193,10983938,10984705,10984450,10985217,10984962,10985729,10985474,10986241,10985986,10986753,10986498,10987265,10987010,10987777,10987522,157185,154625,155905,158721,158209,0,171521,165633,171265,11227905,10990849,10990594,10991361,10991106,10991873,10991618,10992385,10992130,10992897,10992642,10993409,10993154,10993921,10993666,10994433,10994178,10982401,164353,1936897,10995713,10995458,10996225,10995970,0,0,0,0,0,10998017,10997762,0,0,0,0,10999553,10999298,11000065,10999810,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11007489,11007234,0,0,0,0,0,0,0,0,0}, |
|
22535
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,10990338,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1286146,1286402,1286658,1286914,1287170,1287426,1287682,1287938,1288194,1288450,1288706,1288962,1289218,1289474,1289730,1289986,1290242,1290498,1290754,1291010,1291266,1291522,1291778,1292034,1292290,1292546,1292802,1293058,1293314,1293570,1293826,1294082,1294338,1294594,1294850,1295106,1295362,1295618,1295874,1296130,1296386,1296642,1296898,1297154,1297410,1297666,1297922,1298178,1298434,1298690,1298946,1299202,1299458,1299714,1299970,1300226,1300482,1300738,1300994,1301250,1301506,1301762,1302018,1302274,1302530,1302786,1303042,1303298,1303554,1303810,1304066,1304322,1304578,1304834,1305090,1305346,1305602,1305858,1306114,1306370,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22536
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,16728321,16728577,16728833,16729089,16729345,16729601,16729857,16730113,16730369,16730625,16730881,16731137,16731393,16731649,16731905,16732161,16732417,16732673,16732929,16733185,16733441,16733697,16733953,16734209,16734465,16734721,0,0,0,0,0,0,16720130,16720386,16720642,16720898,16721154,16721410,16721666,16721922,16722178,16722434,16722690,16722946,16723202,16723458,16723714,16723970,16724226,16724482,16724738,16724994,16725250,16725506,16725762,16726018,16726274,16726530,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22537
|
|
|
|
|
|
|
{17049601,17049857,17050113,17050369,17050625,17050881,17051137,17051393,17051649,17051905,17052161,17052417,17052673,17052929,17053185,17053441,17053697,17053953,17054209,17054465,17054721,17054977,17055233,17055489,17055745,17056001,17056257,17056513,17056769,17057025,17057281,17057537,17057793,17058049,17058305,17058561,17058817,17059073,17059329,17059585,17039362,17039618,17039874,17040130,17040386,17040642,17040898,17041154,17041410,17041666,17041922,17042178,17042434,17042690,17042946,17043202,17043458,17043714,17043970,17044226,17044482,17044738,17044994,17045250,17045506,17045762,17046018,17046274,17046530,17046786,17047042,17047298,17047554,17047810,17048066,17048322,17048578,17048834,17049090,17049346,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17094657,17094913,17095169,17095425,17095681,17095937,17096193,17096449,17096705,17096961,17097217,17097473,17097729,17097985,17098241,17098497,17098753,17099009,17099265,17099521,17099777,17100033,17100289,17100545,17100801,17101057,17101313,17101569,17101825,17102081,17102337,17102593,17102849,17103105,17103361,17103617,0,0,0,0,17084418,17084674,17084930,17085186,17085442,17085698,17085954,17086210,17086466,17086722,17086978,17087234,17087490,17087746,17088002,17088258,17088514,17088770,17089026,17089282,17089538,17089794,17090050,17090306,17090562,17090818,17091074,17091330,17091586,17091842,17092098,17092354,17092610,17092866,17093122,17093378,0,0,0,0}, |
|
22538
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17143553,17143809,17144065,17144321,17144577,17144833,17145089,17145345,17145601,17145857,17146113,0,17146625,17146881,17147137,17147393,17147649,17147905,17148161,17148417,17148673,17148929,17149185,17149441,17149697,17149953,17150209,0,17150721,17150977,17151233,17151489,17151745,17152001,17152257,0,17152769,17153025,0,17133570,17133826,17134082,17134338,17134594,17134850,17135106,17135362,17135618,17135874,17136130,0,17136642,17136898,17137154,17137410,17137666,17137922,17138178,17138434,17138690,17138946,17139202,17139458,17139714,17139970,17140226,0,17140738,17140994,17141250,17141506,17141762,17142018,17142274,0,17142786,17143042,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22539
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,17612801,17613057,17613313,17613569,17613825,17614081,17614337,17614593,17614849,17615105,17615361,17615617,17615873,17616129,17616385,17616641,17616897,17617153,17617409,17617665,17617921,17618177,17618433,17618689,17618945,17619201,17619457,17619713,17619969,17620225,17620481,17620737,17620993,17621249,17621505,17621761,17622017,17622273,17622529,17622785,17623041,17623297,17623553,17623809,17624065,17624321,17624577,17624833,17625089,17625345,17625601,0,0,0,0,0,0,0,0,0,0,0,0,0,17596418,17596674,17596930,17597186,17597442,17597698,17597954,17598210,17598466,17598722,17598978,17599234,17599490,17599746,17600002,17600258,17600514,17600770,17601026,17601282,17601538,17601794,17602050,17602306,17602562,17602818,17603074,17603330,17603586,17603842,17604098,17604354,17604610,17604866,17605122,17605378,17605634,17605890,17606146,17606402,17606658,17606914,17607170,17607426,17607682,17607938,17608194,17608450,17608706,17608962,17609218,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22540
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,18399233,18399489,18399745,18400001,18400257,18400513,18400769,18401025,18401281,18401537,18401793,18402049,18402305,18402561,18402817,18403073,18403329,18403585,18403841,18404097,18404353,18404609,18404865,18405121,18405377,18405633,18405889,18406145,18406401,18406657,18406913,18407169,18391042,18391298,18391554,18391810,18392066,18392322,18392578,18392834,18393090,18393346,18393602,18393858,18394114,18394370,18394626,18394882,18395138,18395394,18395650,18395906,18396162,18396418,18396674,18396930,18397186,18397442,18397698,18397954,18398210,18398466,18398722,18398978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22541
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,24010753,24011009,24011265,24011521,24011777,24012033,24012289,24012545,24012801,24013057,24013313,24013569,24013825,24014081,24014337,24014593,24014849,24015105,24015361,24015617,24015873,24016129,24016385,24016641,24016897,24017153,24017409,24017665,24017921,24018177,24018433,24018689,24002562,24002818,24003074,24003330,24003586,24003842,24004098,24004354,24004610,24004866,24005122,24005378,24005634,24005890,24006146,24006402,24006658,24006914,24007170,24007426,24007682,24007938,24008194,24008450,24008706,24008962,24009218,24009474,24009730,24009986,24010242,24010498,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22542
|
|
|
|
|
|
|
{32055809,32056065,32056321,32056577,32056833,32057089,32057345,32057601,32057857,32058113,32058369,32058625,32058881,32059137,32059393,32059649,32059905,32060161,32060417,32060673,32060929,32061185,32061441,32061697,32061953,32062209,32062465,32062721,32062977,32063233,32063489,32063745,32064001,32064257,32047106,32047362,32047618,32047874,32048130,32048386,32048642,32048898,32049154,32049410,32049666,32049922,32050178,32050434,32050690,32050946,32051202,32051458,32051714,32051970,32052226,32052482,32052738,32052994,32053250,32053506,32053762,32054018,32054274,32054530,32054786,32055042,32055298,32055554,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
|
22543
|
|
|
|
|
|
|
}; |
|
22544
|
|
|
|
|
|
|
|
|
22545
|
|
|
|
|
|
|
} // namespace unilib |
|
22546
|
|
|
|
|
|
|
|
|
22547
|
|
|
|
|
|
|
///////// |
|
22548
|
|
|
|
|
|
|
// File: unilib/uninorms.cpp |
|
22549
|
|
|
|
|
|
|
///////// |
|
22550
|
|
|
|
|
|
|
|
|
22551
|
|
|
|
|
|
|
// This file is part of UniLib.
|
22552
|
|
|
|
|
|
|
// |
|
22553
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
22554
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
22555
|
|
|
|
|
|
|
// |
|
22556
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
22557
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
22558
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
22559
|
|
|
|
|
|
|
// |
|
22560
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
22561
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
22562
|
|
|
|
|
|
|
|
|
22563
|
|
|
|
|
|
|
namespace unilib { |
|
22564
|
|
|
|
|
|
|
|
|
22565
|
0
|
|
|
|
|
|
// NFC: rewrites `str` in place by running decompose() with the
// kompatibility flag off and then compose() on the result.
void uninorms::nfc(std::u32string& str) {
  const bool kompatibility = false;
  decompose(str, kompatibility);
  compose(str);
}
|
22569
|
|
|
|
|
|
|
|
|
22570
|
0
|
|
|
|
|
|
// NFD: rewrites `str` in place by running decompose() with the
// kompatibility flag off. No composition step follows.
void uninorms::nfd(std::u32string& str) {
  const bool kompatibility = false;
  decompose(str, kompatibility);
}
|
22573
|
|
|
|
|
|
|
|
|
22574
|
0
|
|
|
|
|
|
// NFKC: rewrites `str` in place by running decompose() with the
// kompatibility flag on and then compose() on the result.
void uninorms::nfkc(std::u32string& str) {
  const bool kompatibility = true;
  decompose(str, kompatibility);
  compose(str);
}
|
22578
|
|
|
|
|
|
|
|
|
22579
|
0
|
|
|
|
|
|
// NFKD: rewrites `str` in place by running decompose() with the
// kompatibility flag on. No composition step follows.
void uninorms::nfkd(std::u32string& str) {
  const bool kompatibility = true;
  decompose(str, kompatibility);
}
|
22582
|
|
|
|
|
|
|
|
|
22583
|
0
|
|
|
|
|
|
void uninorms::compose(std::u32string& str) { |
|
22584
|
|
|
|
|
|
|
size_t old, com; |
|
22585
|
0
|
0
|
|
|
|
|
for (old = 0, com = 0; old < str.size(); old++, com++) { |
|
22586
|
0
|
|
|
|
|
|
str[com] = str[old]; |
|
22587
|
0
|
0
|
|
|
|
|
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22588
|
|
|
|
|
|
|
// Check Hangul composition L + V [+ T]. |
|
22589
|
0
|
0
|
|
|
|
|
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22590
|
0
|
|
|
|
|
|
str[com] = Hangul::SBase + ((str[old] - Hangul::LBase) * Hangul::VCount + str[old + 1] - Hangul::VBase) * Hangul::TCount; |
|
22591
|
|
|
|
|
|
|
old++; |
|
22592
|
0
|
0
|
|
|
|
|
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22593
|
0
|
|
|
|
|
|
str[com] += str[++old] - Hangul::TBase; |
|
22594
|
|
|
|
|
|
|
} |
|
22595
|
0
|
0
|
|
|
|
|
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22596
|
|
|
|
|
|
|
// Check Hangul composition LV + T |
|
22597
|
0
|
0
|
|
|
|
|
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22598
|
0
|
|
|
|
|
|
str[com] += str[++old] - Hangul::TBase; |
|
22599
|
0
|
0
|
|
|
|
|
} else if (str[old] < CHARS) { |
|
22600
|
|
|
|
|
|
|
// Check composition_data |
|
22601
|
0
|
|
|
|
|
|
auto composition = &composition_block[composition_index[str[old] >> 8]][str[old] & 0xFF]; |
|
22602
|
|
|
|
|
|
|
auto starter = com; |
|
22603
|
0
|
0
|
|
|
|
|
for (int last_ccc = -1; old + 1 < str.size(); old++) { |
|
22604
|
0
|
0
|
|
|
|
|
int ccc = str[old + 1] < CHARS ? ccc_block[ccc_index[str[old + 1] >> 8]][str[old + 1] & 0xFF] : 0; |
|
22605
|
0
|
0
|
|
|
|
|
if (composition[1] - composition[0] && last_ccc < ccc) { |
|
|
|
0
|
|
|
|
|
|
|
22606
|
|
|
|
|
|
|
// Try finding a composition. |
|
22607
|
|
|
|
|
|
|
auto l = composition[0], r = composition[1]; |
|
22608
|
0
|
0
|
|
|
|
|
while (l + 2 < r) { |
|
22609
|
0
|
|
|
|
|
|
auto m = l + (((r - l) >> 1) & ~1); |
|
22610
|
0
|
0
|
|
|
|
|
if (composition_data[m] <= str[old + 1]) l = m; |
|
22611
|
0
|
0
|
|
|
|
|
if (composition_data[m] >= str[old + 1]) r = m; |
|
22612
|
|
|
|
|
|
|
} |
|
22613
|
0
|
0
|
|
|
|
|
if (composition_data[l] == str[old + 1]) { |
|
22614
|
|
|
|
|
|
|
// Found a composition. |
|
22615
|
0
|
|
|
|
|
|
str[starter] = composition_data[l + 1]; |
|
22616
|
0
|
|
|
|
|
|
composition = &composition_block[composition_index[composition_data[l + 1] >> 8]][composition_data[l + 1] & 0xFF]; |
|
22617
|
0
|
|
|
|
|
|
continue; |
|
22618
|
|
|
|
|
|
|
} |
|
22619
|
|
|
|
|
|
|
} |
|
22620
|
|
|
|
|
|
|
|
|
22621
|
0
|
0
|
|
|
|
|
if (!ccc) break; |
|
22622
|
|
|
|
|
|
|
last_ccc = ccc; |
|
22623
|
0
|
|
|
|
|
|
str[++com] = str[old + 1]; |
|
22624
|
|
|
|
|
|
|
} |
|
22625
|
|
|
|
|
|
|
} |
|
22626
|
|
|
|
|
|
|
} |
|
22627
|
|
|
|
|
|
|
|
|
22628
|
0
|
0
|
|
|
|
|
if (com < old) str.resize(com); |
|
22629
|
0
|
|
|
|
|
|
} |
|
22630
|
|
|
|
|
|
|
|
|
22631
|
0
|
|
|
|
|
|
void uninorms::decompose(std::u32string& str, bool kompatibility) { |
|
22632
|
|
|
|
|
|
|
// Count how much additional space do we need. |
|
22633
|
|
|
|
|
|
|
bool any_decomposition = false; |
|
22634
|
|
|
|
|
|
|
size_t additional = 0; |
|
22635
|
0
|
0
|
|
|
|
|
for (auto&& chr : str) { |
|
22636
|
|
|
|
|
|
|
int decomposition_len = 0; |
|
22637
|
|
|
|
|
|
|
|
|
22638
|
0
|
0
|
|
|
|
|
if (chr >= Hangul::SBase && chr < Hangul::SBase + Hangul::SCount) { |
|
22639
|
|
|
|
|
|
|
// Hangul decomposition. |
|
22640
|
0
|
0
|
|
|
|
|
decomposition_len = 2 + ((chr - Hangul::SBase) % Hangul::TCount ? 1 : 0); |
|
22641
|
0
|
0
|
|
|
|
|
} else if (chr < CHARS) { |
|
22642
|
|
|
|
|
|
|
// Check decomposition_data. |
|
22643
|
0
|
|
|
|
|
|
auto decomposition = &decomposition_block[decomposition_index[chr >> 8]][chr & 0xFF]; |
|
22644
|
0
|
|
|
|
|
|
decomposition_len = (decomposition[1] >> 2) - (decomposition[0] >> 2); |
|
22645
|
0
|
0
|
|
|
|
|
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
|
|
0
|
|
|
|
|
|
|
22646
|
0
|
0
|
|
|
|
|
if (decomposition_len && kompatibility && (decomposition[0] & 2)) |
|
|
|
0
|
|
|
|
|
|
|
22647
|
|
|
|
|
|
|
// Further kompatibility decomposition. |
|
22648
|
0
|
0
|
|
|
|
|
for (auto i = decomposition[0] >> 2; i < decomposition[1] >> 2; i++) { |
|
22649
|
0
|
|
|
|
|
|
auto further_decomposition = &decomposition_block[decomposition_index[decomposition_data[i] >> 8]][decomposition_data[i] & 0xFF]; |
|
22650
|
0
|
0
|
|
|
|
|
if (further_decomposition[0] & 1) decomposition_len += (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2) - 1; |
|
22651
|
|
|
|
|
|
|
} |
|
22652
|
|
|
|
|
|
|
} |
|
22653
|
|
|
|
|
|
|
// Do we decompose current character? |
|
22654
|
0
|
0
|
|
|
|
|
if (!decomposition_len) continue; |
|
22655
|
|
|
|
|
|
|
any_decomposition = true; |
|
22656
|
0
|
|
|
|
|
|
additional += decomposition_len - 1; |
|
22657
|
|
|
|
|
|
|
} |
|
22658
|
|
|
|
|
|
|
|
|
22659
|
|
|
|
|
|
|
// If needed, allocate enough space and perform the decomposition. |
|
22660
|
0
|
0
|
|
|
|
|
if (any_decomposition) { |
|
22661
|
0
|
|
|
|
|
|
str.resize(str.size() + additional); |
|
22662
|
0
|
0
|
|
|
|
|
for (size_t dec = str.size(), old = dec - additional; old--; ) |
|
22663
|
0
|
0
|
|
|
|
|
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22664
|
|
|
|
|
|
|
// Hangul decomposition. |
|
22665
|
0
|
|
|
|
|
|
char32_t s_index = str[old] - Hangul::SBase; |
|
22666
|
0
|
0
|
|
|
|
|
if (s_index % Hangul::TCount) str[--dec] = Hangul::TBase + s_index % Hangul::TCount; |
|
22667
|
0
|
|
|
|
|
|
str[--dec] = Hangul::VBase + (s_index % Hangul::NCount) / Hangul::TCount; |
|
22668
|
0
|
|
|
|
|
|
str[--dec] = Hangul::LBase + s_index / Hangul::NCount; |
|
22669
|
0
|
0
|
|
|
|
|
} else if (str[old] < CHARS) { |
|
22670
|
|
|
|
|
|
|
// Check decomposition_data. |
|
22671
|
0
|
|
|
|
|
|
auto decomposition = &decomposition_block[decomposition_index[str[old] >> 8]][str[old] & 0xFF]; |
|
22672
|
0
|
|
|
|
|
|
int decomposition_len = (decomposition[1] >> 2) - (decomposition[0] >> 2); |
|
22673
|
0
|
0
|
|
|
|
|
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
|
|
0
|
|
|
|
|
|
|
22674
|
0
|
0
|
|
|
|
|
if (decomposition_len && kompatibility && (decomposition[0] & 2)) { |
|
|
|
0
|
|
|
|
|
|
|
22675
|
|
|
|
|
|
|
// Further kompatibility decomposition. |
|
22676
|
0
|
0
|
|
|
|
|
while (decomposition_len--) { |
|
22677
|
0
|
|
|
|
|
|
auto chr = decomposition_data[(decomposition[0] >> 2) + decomposition_len]; |
|
22678
|
0
|
|
|
|
|
|
auto further_decomposition = &decomposition_block[decomposition_index[chr >> 8]][chr & 0xFF]; |
|
22679
|
0
|
0
|
|
|
|
|
if (further_decomposition[0] & 1) { |
|
22680
|
0
|
0
|
|
|
|
|
for (int further_decomposition_len = (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2); further_decomposition_len--; ) |
|
22681
|
0
|
|
|
|
|
|
str[--dec] = decomposition_data[(further_decomposition[0] >> 2) + further_decomposition_len]; |
|
22682
|
|
|
|
|
|
|
} else { |
|
22683
|
0
|
|
|
|
|
|
str[--dec] = chr; |
|
22684
|
|
|
|
|
|
|
} |
|
22685
|
|
|
|
|
|
|
} |
|
22686
|
0
|
0
|
|
|
|
|
} else if (decomposition_len) { |
|
22687
|
|
|
|
|
|
|
// Non-recursive decomposition. |
|
22688
|
0
|
0
|
|
|
|
|
while (decomposition_len--) |
|
22689
|
0
|
|
|
|
|
|
str[--dec] = decomposition_data[(decomposition[0] >> 2) + decomposition_len]; |
|
22690
|
|
|
|
|
|
|
} else { |
|
22691
|
|
|
|
|
|
|
// No decomposition. |
|
22692
|
0
|
|
|
|
|
|
str[--dec] = str[old]; |
|
22693
|
|
|
|
|
|
|
} |
|
22694
|
|
|
|
|
|
|
} else { |
|
22695
|
|
|
|
|
|
|
// Non-Unicode character. |
|
22696
|
0
|
|
|
|
|
|
str[--dec] = str[old]; |
|
22697
|
|
|
|
|
|
|
} |
|
22698
|
|
|
|
|
|
|
} |
|
22699
|
|
|
|
|
|
|
|
|
22700
|
|
|
|
|
|
|
// Sort combining marks. |
|
22701
|
0
|
0
|
|
|
|
|
for (size_t i = 1; i < str.size(); i++) { |
|
22702
|
0
|
0
|
|
|
|
|
unsigned ccc = str[i] < CHARS ? ccc_block[ccc_index[str[i] >> 8]][str[i] & 0xFF] : 0; |
|
22703
|
0
|
0
|
|
|
|
|
if (!ccc) continue; |
|
22704
|
|
|
|
|
|
|
|
|
22705
|
|
|
|
|
|
|
auto chr = str[i]; |
|
22706
|
|
|
|
|
|
|
size_t j; |
|
22707
|
0
|
0
|
|
|
|
|
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
22708
|
0
|
|
|
|
|
|
str[j] = chr; |
|
22709
|
|
|
|
|
|
|
} |
|
22710
|
0
|
|
|
|
|
|
} |
|
22711
|
|
|
|
|
|
|
|
|
22712
|
|
|
|
|
|
|
// Data fields |
|
22713
|
|
|
|
|
|
|
const char32_t uninorms::CHARS; |
|
22714
|
|
|
|
|
|
|
|
|
22715
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::SBase; |
|
22716
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::LBase; |
|
22717
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::VBase; |
|
22718
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::TBase; |
|
22719
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::LCount; |
|
22720
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::VCount; |
|
22721
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::TCount; |
|
22722
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::NCount; |
|
22723
|
|
|
|
|
|
|
const char32_t uninorms::Hangul::SCount; |
|
22724
|
|
|
|
|
|
|
|
|
22725
|
|
|
|
|
|
|
const uint8_t uninorms::ccc_index[uninorms::CHARS >> 8] = { |
|
22726
|
|
|
|
|
|
|
0,0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,0,0,15,0,0,0,16,17,18,19,20,21,22,0,0,23,0,0,0,0,0,0,0,0,0,0,0,24,25,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,0,28,29,30,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,33,0,0,34,35,36,0,0,0,0,0,0,37,0,0,38,39,40,41,42,43,44,45,46,47,48,49,50,51,0,52,53,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,56,0,0,0,57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,58,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,59,60,0,0,0,0,0,0,0,0,0,0,0,0,0,61,56,62,0,63,0,0,0,64,65,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, |
|
22727
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 |
|
22728
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 |
|
22729
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 |
|
22730
|
|
|
|
|
|
|
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 |
|
22731
|
|
|
|
|
|
|
}; |
|
22732
|
|
|
|
|
|
|
const uint8_t uninorms::ccc_block[][256] = { |
|
22733
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22734
|
|
|
|
|
|
|
{230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,232,220,220,220,220,232,216,220,220,220,220,220,202,202,220,220,220,220,202,202,220,220,220,220,220,220,220,220,220,220,220,1,1,1,1,1,220,220,220,220,230,230,230,230,230,230,230,230,240,230,220,220,220,230,230,230,220,220,0,230,230,230,220,220,220,220,230,232,220,220,230,233,234,234,233,234,234,233,230,230,230,230,230,230,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22735
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22736
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,230,230,230,230,220,230,230,230,222,220,230,230,230,230,230,230,220,220,220,220,220,220,230,230,220,230,230,222,228,230,10,11,12,13,14,15,16,17,18,19,19,20,21,22,0,23,0,24,25,0,230,220,0,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22737
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,30,31,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,27,28,29,30,31,32,33,34,230,230,220,220,230,230,230,230,230,220,230,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,35,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,0,0,230,230,230,230,220,230,0,0,230,230,0,220,230,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22738
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,230,230,220,230,230,220,220,220,230,220,220,230,220,230,230,230,220,230,220,230,220,230,220,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,220,230,0,0,0,0,0,0,0,0,0,220,0,0}, |
|
22739
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,230,230,230,230,230,230,230,230,230,0,230,230,230,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,220,220,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,220,220,220,220,220,230,230,230,230,230,230,230,230,230,230,230,230,230,230,0,220,230,230,220,230,230,220,230,230,230,220,220,220,27,28,29,230,230,230,220,230,230,220,220,230,230,230,230,230}, |
|
22740
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,230,220,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0}, |
|
22741
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22742
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22743
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,84,91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22744
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22745
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,103,103,9,0,0,0,0,0,0,0,0,0,0,0,0,0,107,107,107,107,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,118,118,9,0,0,0,0,0,0,0,0,0,0,0,0,0,122,122,122,122,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22746
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,220,0,216,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,129,130,0,132,0,0,0,0,0,130,130,130,130,0,0,130,0,230,230,9,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22747
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22748
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22749
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22750
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,228,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22751
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,222,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22752
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,220,220,220,220,220,220,230,230,220,0,220,220,230,230,220,220,230,230,230,230,230,220,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22753
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22754
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,1,220,220,220,220,220,230,230,220,220,220,220,230,0,1,1,1,1,1,1,1,0,0,0,0,220,0,0,0,0,0,0,230,0,0,0,230,230,0,0,0,0,0,0}, |
|
22755
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,220,230,230,230,230,230,230,230,220,230,230,234,214,220,202,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,232,228,228,220,218,230,233,220,230,220}, |
|
22756
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,1,1,230,230,230,230,1,1,1,230,230,0,0,0,0,230,0,0,0,1,1,230,220,230,1,1,220,220,220,220,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22757
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22758
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230}, |
|
22759
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,218,228,232,222,224,224,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,8,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22760
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,230,230,230,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22761
|
|
|
|
|
|
|
{0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22762
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22763
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,230,230,220,0,0,230,230,0,0,0,0,0,230,230,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0}, |
|
22764
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22765
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22766
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,220,220,220,220,220,220,220,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22767
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0}, |
|
22768
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22769
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22770
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,220,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,1,220,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22771
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22772
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220}, |
|
22773
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,230,230,230,220,230,220,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,220,230,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22774
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22775
|
|
|
|
|
|
|
{230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22776
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22777
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,0,0,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22778
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22779
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22780
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22781
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22782
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22783
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22784
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22785
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22786
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22787
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,9,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22788
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0}, |
|
22789
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22790
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22791
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22792
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,216,216,1,1,1,0,0,0,226,216,216,216,216,216,0,0,0,0,0,0,0,0,220,220,220,220,220,220,220,220,0,0,230,230,230,230,230,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22793
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22794
|
|
|
|
|
|
|
{230,230,230,230,230,230,230,0,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,230,0,0,230,230,230,230,230,230,230,0,230,230,0,230,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22795
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22796
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,232,232,220,230,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22797
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,220,220,220,220,220,220,220,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22798
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,230,230,230,230,230,230,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0} |
|
22799
|
|
|
|
|
|
|
}; |
|
22800
|
|
|
|
|
|
|
|
|
22801
|
|
|
|
|
|
|
const uint8_t uninorms::composition_index[uninorms::CHARS >> 8] = { |
|
22802
|
|
|
|
|
|
|
0,1,2,3,4,5,6,5,5,7,5,8,9,10,5,5,11,5,5,5,5,5,5,5,5,5,5,12,5,5,13,14,5,15,16,5,5,5,5,5,5,5,5,5,5,5,5,5,17,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,18,19,5,20,21,22,5,5,5,23,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, |
|
22803
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 |
|
22804
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 |
|
22805
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 |
|
22806
|
|
|
|
|
|
|
5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5 |
|
22807
|
|
|
|
|
|
|
}; |
|
22808
|
|
|
|
|
|
|
const uint16_t uninorms::composition_block[][257] = { |
|
22809
|
|
|
|
|
|
|
{1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,3,5,7,7,7,39,45,55,67,101,103,117,131,161,163,173,185,191,209,241,245,245,261,275,289,327,331,343,347,365,377,377,377,377,377,377,377,409,415,425,437,471,473,487,503,531,535,545,557,563,581,613,617,617,633,647,663,701,705,719,723,743,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,755,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,761,769,769,771,773,777,779,779,779,787,787,787,787,787,789,789,789,789,789,797,803,805,805,807,807,807,807,815,815,815,815,815,815,823,823,825,827,831,833,833,833,841,841,841,841,841,843,843,843,843,843,851,857,859,859,861,861,861,861,869,869,869,869}, |
|
22810
|
|
|
|
|
|
|
{869,869,869,877,885,885,885,885,885,885,885,885,885,885,885,885,885,885,885,889,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,893,897,901,901,901,901,901,901,901,901,901,901,901,901,901,903,905,905,905,905,905,907,909,909,909,909,909,909,909,911,913,915,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,917,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,919,929,939,939,939,939,939,939,939,939,939,939,939,939,939,939,949,959,959,959,959,959,959,959,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,961,963,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965}, |
|
22811
|
|
|
|
|
|
|
{965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,965,967,969,971,973,973,973,973,973,975,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,977,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979}, |
|
22812
|
|
|
|
|
|
|
{979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,979,993,993,993,993,1001,1001,1011,1011,1025,1025,1025,1025,1025,1025,1033,1033,1035,1035,1035,1035,1047,1047,1047,1047,1057,1057,1057,1059,1059,1061,1061,1061,1077,1077,1077,1077,1085,1085,1097,1097,1113,1113,1113,1113,1113,1113,1121,1121,1125,1125,1125,1125,1141,1141,1141,1141,1153,1159,1165,1165,1165,1167,1167,1167,1167,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171,1171}, |
|
22813
|
|
|
|
|
|
|
{1171,1171,1171,1171,1171,1171,1171,1173,1173,1173,1173,1173,1173,1173,1173,1173,1173,1177,1177,1177,1179,1179,1185,1189,1191,1199,1199,1201,1201,1201,1201,1203,1203,1203,1203,1203,1211,1211,1211,1211,1213,1213,1213,1213,1215,1215,1217,1217,1217,1221,1221,1221,1223,1223,1229,1233,1235,1243,1243,1245,1245,1245,1245,1247,1247,1247,1247,1247,1255,1255,1255,1255,1257,1257,1257,1257,1259,1259,1261,1261,1261,1261,1261,1261,1261,1261,1261,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1263,1265,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1267,1269,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1271,1273,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275}, |
|
22814
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22815
|
|
|
|
|
|
|
{1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1275,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1281,1283,1283,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1285,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1287,1289,1289,1289,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291}, |
|
22816
|
|
|
|
|
|
|
{1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1291,1293,1293,1293,1293,1293,1293,1293,1293,1295,1295,1295,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1297,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301}, |
|
22817
|
|
|
|
|
|
|
{1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1301,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1309,1313,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315}, |
|
22818
|
|
|
|
|
|
|
{1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1315,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1317,1319,1319,1319,1319,1319,1319,1319,1325,1325,1325,1325,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327}, |
|
22819
|
|
|
|
|
|
|
{1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1327,1331,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1333,1339,1339,1339,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341}, |
|
22820
|
|
|
|
|
|
|
{1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1341,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343,1343}, |
|
22821
|
|
|
|
|
|
|
{1343,1343,1343,1343,1343,1343,1345,1345,1347,1347,1349,1349,1351,1351,1353,1353,1353,1353,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1355,1357,1357,1359,1359,1361,1363,1363,1363,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365}, |
|
22822
|
|
|
|
|
|
|
{1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1365,1367,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1369,1371,1373,1373,1373,1373,1373,1373,1373,1375,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1377,1381,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1385,1387,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1389,1391,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393,1393}, |
|
22823
|
|
|
|
|
|
|
{1393,1401,1409,1411,1413,1415,1417,1419,1421,1429,1437,1439,1441,1443,1445,1447,1449,1453,1457,1457,1457,1457,1457,1457,1457,1461,1465,1465,1465,1465,1465,1465,1465,1473,1481,1483,1485,1487,1489,1491,1493,1501,1509,1511,1513,1515,1517,1519,1521,1527,1533,1533,1533,1533,1533,1533,1533,1539,1545,1545,1545,1545,1545,1545,1545,1549,1553,1553,1553,1553,1553,1553,1553,1557,1561,1561,1561,1561,1561,1561,1561,1567,1573,1573,1573,1573,1573,1573,1573,1573,1579,1579,1579,1579,1579,1579,1579,1587,1595,1597,1599,1601,1603,1605,1607,1615,1623,1625,1627,1629,1631,1633,1635,1637,1637,1637,1637,1639,1639,1639,1639,1639,1639,1639,1639,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1641,1643,1643,1643,1643,1643,1643,1643,1643,1643,1649,1649,1649,1649,1649,1649,1649,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1651,1653,1653,1653,1653,1653,1653,1653,1653,1659,1659}, |
|
22824
|
|
|
|
|
|
|
{1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1659,1661,1661,1663,1663,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1665,1667,1667,1669,1669,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671,1671}, |
|
22825
|
|
|
|
|
|
|
{1671,1671,1671,1671,1673,1673,1673,1673,1673,1675,1675,1675,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1677,1679,1679,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1681,1683,1683,1683,1683,1683,1683,1683,1685,1685,1687,1687,1687,1689,1689,1689,1689,1689,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1691,1693,1693,1693,1695,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1697,1699,1701,1701,1701,1703,1705,1705,1705,1707,1709,1711,1713,1713,1713,1713,1713,1715,1717,1717,1717,1719,1721,1721,1721,1721,1721,1721,1721,1721,1721,1721,1723,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1725,1727,1727,1727,1727,1727,1727,1729,1731,1731,1733,1733,1733,1733,1733,1733,1733,1735,1737,1739,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741}, |
|
22826
|
|
|
|
|
|
|
{1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1741,1743,1743,1743,1743,1743,1745,1745,1747,1747,1749,1749,1751,1751,1753,1753,1755,1755,1757,1757,1759,1759,1761,1761,1763,1763,1765,1765,1767,1767,1767,1769,1769,1771,1771,1773,1773,1773,1773,1773,1773,1773,1777,1777,1777,1781,1781,1781,1785,1785,1785,1789,1789,1789,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1793,1795,1795,1795,1795,1795,1795,1795,1795,1795,1797,1797,1797,1797,1797,1799,1799,1801,1801,1803,1803,1805,1805,1807,1807,1809,1809,1811,1811,1813,1813,1815,1815,1817,1817,1819,1819,1821,1821,1821,1823,1823,1825,1825,1827,1827,1827,1827,1827,1827,1827,1831,1831,1831,1835,1835,1835,1839,1839,1839,1843,1843,1843,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1847,1849,1851,1853,1855,1855,1855,1855,1855,1855,1855,1855,1855,1855,1855,1857,1857,1857}, |
|
22827
|
|
|
|
|
|
|
{1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1857,1859,1859,1861,1861,1861,1861,1861,1861,1861,1861,1861,1861,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863}, |
|
22828
|
|
|
|
|
|
|
{1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1863,1865,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867}, |
|
22829
|
|
|
|
|
|
|
{1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1867,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871}, |
|
22830
|
|
|
|
|
|
|
{1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1871,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877}, |
|
22831
|
|
|
|
|
|
|
{1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1877,1879,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881}, |
|
22832
|
|
|
|
|
|
|
{1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1881,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883,1883} |
|
22833
|
|
|
|
|
|
|
}; |
|
22834
|
|
|
|
|
|
|
const char32_t uninorms::composition_data[] = { |
|
22835
|
|
|
|
|
|
|
0,824,8814,824,8800,824,8815,768,192,769,193,770,194,771,195,772,256,774,258,775,550,776,196,777,7842,778,197,780,461,783,512,785,514,803,7840,805,7680,808,260,775,7682,803,7684,817,7686,769,262,770,264,775,266,780,268,807,199,775,7690,780,270,803,7692,807,7696,813,7698,817,7694,768,200,769,201,770,202,771,7868,772,274,774,276,775,278,776,203,777,7866,780,282,783,516,785,518,803,7864,807,552,808,280,813,7704,816,7706,775,7710,769,500,770,284,772,7712,774,286,775,288,780,486,807,290,770,292,775,7714,776,7718,780,542,803,7716,807,7720,814,7722,768,204,769,205,770,206,771,296,772,298,774,300,775,304,776,207,777,7880,780,463,783,520,785,522,803,7882,808,302,816,7724,770,308,769,7728,780,488,803,7730,807,310,817,7732,769,313,780,317,803,7734,807,315,813,7740,817,7738,769,7742,775,7744,803,7746,768,504,769,323,771,209,775,7748,780,327,803,7750,807,325,813,7754,817,7752,768,210,769,211,770,212,771,213,772,332,774,334,775,558,776,214,777,7886,779,336,780,465,783,524,785,526,795,416,803,7884,808,490,769,7764,775,7766,769,340,775,7768,780,344,783,528,785,530,803,7770,807,342,817,7774,769,346,770,348,775,7776,780,352,803,7778,806,536,807,350,775,7786,780,356,803,7788,806,538,807,354,813,7792,817,7790,768,217,769,218,770,219,771,360,772,362,774,364,776,220,777,7910,778,366,779,368,780,467,783,532,785,534,795,431,803,7908,804,7794,808,370,813,7798,816,7796,771,7804,803,7806,768,7808,769,7810,770,372,775,7814,776,7812,803,7816,775,7818,776,7820,768,7922,769,221,770,374,771,7928,772,562,775,7822,776,376,777,7926,803,7924,769,377,770,7824,775,379,780,381,803,7826,817,7828,768,224,769,225,770,226,771,227,772,257,774,259,775,551,776,228,777,7843,778,229,780,462,783,513,785,515,803,7841,805,7681,808,261,775,7683,803,7685,817,7687,769,263,770,265,775,267,780,269,807,231,775,7691,780,271,803,7693,807,7697,813,7699,817,7695,768,232,769,233,770,234,771,7869,772,275,774,277,775,279,776,235,777,7867,780,283,783,517,785,519,803,7865,807,553,808,281,813,7705,816,7707,775,7711,769,501,770, |
|
22836
|
|
|
|
|
|
|
285,772,7713,774,287,775,289,780,487,807,291,770,293,775,7715,776,7719,780,543,803,7717,807,7721,814,7723,817,7830,768,236,769,237,770,238,771,297,772,299,774,301,776,239,777,7881,780,464,783,521,785,523,803,7883,808,303,816,7725,770,309,780,496,769,7729,780,489,803,7731,807,311,817,7733,769,314,780,318,803,7735,807,316,813,7741,817,7739,769,7743,775,7745,803,7747,768,505,769,324,771,241,775,7749,780,328,803,7751,807,326,813,7755,817,7753,768,242,769,243,770,244,771,245,772,333,774,335,775,559,776,246,777,7887,779,337,780,466,783,525,785,527,795,417,803,7885,808,491,769,7765,775,7767,769,341,775,7769,780,345,783,529,785,531,803,7771,807,343,817,7775,769,347,770,349,775,7777,780,353,803,7779,806,537,807,351,775,7787,776,7831,780,357,803,7789,806,539,807,355,813,7793,817,7791,768,249,769,250,770,251,771,361,772,363,774,365,776,252,777,7911,778,367,779,369,780,468,783,533,785,535,795,432,803,7909,804,7795,808,371,813,7799,816,7797,771,7805,803,7807,768,7809,769,7811,770,373,775,7815,776,7813,778,7832,803,7817,775,7819,776,7821,768,7923,769,253,770,375,771,7929,772,563,775,7823,776,255,777,7927,778,7833,803,7925,769,378,770,7825,775,380,780,382,803,7827,817,7829,768,8173,769,901,834,8129,768,7846,769,7844,771,7850,777,7848,772,478,769,506,769,508,772,482,769,7688,768,7872,769,7870,771,7876,777,7874,769,7726,768,7890,769,7888,771,7894,777,7892,769,7756,772,556,776,7758,772,554,769,510,768,475,769,471,772,469,780,473,768,7847,769,7845,771,7851,777,7849,772,479,769,507,769,509,772,483,769,7689,768,7873,769,7871,771,7877,777,7875,769,7727,768,7891,769,7889,771,7895,777,7893,769,7757,772,557,776,7759,772,555,769,511,768,476,769,472,772,470,780,474,768,7856,769,7854,771,7860,777,7858,768,7857,769,7855,771,7861,777,7859,768,7700,769,7702,768,7701,769,7703,768,7760,769,7762,768,7761,769,7763,775,7780,775,7781,775,7782,775,7783,769,7800,769,7801,776,7802,776,7803,775,7835,768,7900,769,7898,771,7904,777,7902,803,7906,768,7901,769,7899,771,7905,777,7903,803,7907,768,7914,769,7912,
 |
|
22837
|
|
|
|
|
|
|
771,7918,777,7916,803,7920,768,7915,769,7913,771,7919,777,7917,803,7921,780,494,772,492,772,493,772,480,772,481,774,7708,774,7709,772,560,772,561,780,495,768,8122,769,902,772,8121,774,8120,787,7944,788,7945,837,8124,768,8136,769,904,787,7960,788,7961,768,8138,769,905,787,7976,788,7977,837,8140,768,8154,769,906,772,8153,774,8152,776,938,787,7992,788,7993,768,8184,769,908,787,8008,788,8009,788,8172,768,8170,769,910,772,8169,774,8168,776,939,788,8025,768,8186,769,911,787,8040,788,8041,837,8188,837,8116,837,8132,768,8048,769,940,772,8113,774,8112,787,7936,788,7937,834,8118,837,8115,768,8050,769,941,787,7952,788,7953,768,8052,769,942,787,7968,788,7969,834,8134,837,8131,768,8054,769,943,772,8145,774,8144,776,970,787,7984,788,7985,834,8150,768,8056,769,972,787,8000,788,8001,787,8164,788,8165,768,8058,769,973,772,8161,774,8160,776,971,787,8016,788,8017,834,8166,768,8060,769,974,787,8032,788,8033,834,8182,837,8179,768,8146,769,912,834,8151,768,8162,769,944,834,8167,837,8180,769,979,776,980,776,1031,774,1232,776,1234,769,1027,768,1024,774,1238,776,1025,774,1217,776,1244,776,1246,768,1037,772,1250,774,1049,776,1252,769,1036,776,1254,772,1262,774,1038,776,1264,779,1266,776,1268,776,1272,776,1260,774,1233,776,1235,769,1107,768,1104,774,1239,776,1105,774,1218,776,1245,776,1247,768,1117,772,1251,774,1081,776,1253,769,1116,776,1255,772,1263,774,1118,776,1265,779,1267,776,1269,776,1273,776,1261,776,1111,783,1142,783,1143,776,1242,776,1243,776,1258,776,1259,1619,1570,1620,1571,1621,1573,1620,1572,1620,1574,1620,1730,1620,1747,1620,1728,2364,2345,2364,2353,2364,2356,2494,2507,2519,2508,2878,2891,2902,2888,2903,2892,3031,2964,3006,3018,3031,3020,3006,3019,3158,3144,3285,3264,3266,3274,3285,3271,3286,3272,3285,3275,3390,3402,3415,3404,3390,3403,3530,3546,3535,3548,3551,3550,3530,3549,4142,4134,6965,6918,6965,6920,6965,6922,6965,6924,6965,6926,6965,6930,6965,6971,6965,6973,6965,6976,6965,6977,6965,6979,772,7736,772,7737,772,7772,772,7773,775,7784,775,7785,770,7852,774,7862,770,7853,774, 
|
|
22838
|
|
|
|
|
|
|
7863,770,7878,770,7879,770,7896,770,7897,768,7938,769,7940,834,7942,837,8064,768,7939,769,7941,834,7943,837,8065,837,8066,837,8067,837,8068,837,8069,837,8070,837,8071,768,7946,769,7948,834,7950,837,8072,768,7947,769,7949,834,7951,837,8073,837,8074,837,8075,837,8076,837,8077,837,8078,837,8079,768,7954,769,7956,768,7955,769,7957,768,7962,769,7964,768,7963,769,7965,768,7970,769,7972,834,7974,837,8080,768,7971,769,7973,834,7975,837,8081,837,8082,837,8083,837,8084,837,8085,837,8086,837,8087,768,7978,769,7980,834,7982,837,8088,768,7979,769,7981,834,7983,837,8089,837,8090,837,8091,837,8092,837,8093,837,8094,837,8095,768,7986,769,7988,834,7990,768,7987,769,7989,834,7991,768,7994,769,7996,834,7998,768,7995,769,7997,834,7999,768,8002,769,8004,768,8003,769,8005,768,8010,769,8012,768,8011,769,8013,768,8018,769,8020,834,8022,768,8019,769,8021,834,8023,768,8027,769,8029,834,8031,768,8034,769,8036,834,8038,837,8096,768,8035,769,8037,834,8039,837,8097,837,8098,837,8099,837,8100,837,8101,837,8102,837,8103,768,8042,769,8044,834,8046,837,8104,768,8043,769,8045,834,8047,837,8105,837,8106,837,8107,837,8108,837,8109,837,8110,837,8111,837,8114,837,8130,837,8178,837,8119,768,8141,769,8142,834,8143,837,8135,837,8183,768,8157,769,8158,834,8159,824,8602,824,8603,824,8622,824,8653,824,8655,824,8654,824,8708,824,8713,824,8716,824,8740,824,8742,824,8769,824,8772,824,8775,824,8777,824,8813,824,8802,824,8816,824,8817,824,8820,824,8821,824,8824,824,8825,824,8832,824,8833,824,8928,824,8929,824,8836,824,8837,824,8840,824,8841,824,8930,824,8931,824,8876,824,8877,824,8878,824,8879,824,8938,824,8939,824,8940,824,8941,12441,12436,12441,12364,12441,12366,12441,12368,12441,12370,12441,12372,12441,12374,12441,12376,12441,12378,12441,12380,12441,12382,12441,12384,12441,12386,12441,12389,12441,12391,12441,12393,12441,12400,12442,12401,12441,12403,12442,12404,12441,12406,12442,12407,12441,12409,12442,12410,12441,12412,12442,12413,12441,12446,12441,12532,12441,12460,12441,12462,12441,12464,12441,12466,12441, |
|
22839
|
|
|
|
|
|
|
12468,12441,12470,12441,12472,12441,12474,12441,12476,12441,12478,12441,12480,12441,12482,12441,12485,12441,12487,12441,12489,12441,12496,12442,12497,12441,12499,12442,12500,12441,12502,12442,12503,12441,12505,12442,12506,12441,12508,12442,12509,12441,12535,12441,12536,12441,12537,12441,12538,12441,12542,69818,69786,69818,69788,69818,69803,69927,69934,69927,69935,70462,70475,70487,70476,70832,70844,70842,70843,70845,70846,71087,71098,71087,71099,71984,71992 |
|
22840
|
|
|
|
|
|
|
}; |
|
22841
|
|
|
|
|
|
|
|
|
22842
|
|
|
|
|
|
|
const uint8_t uninorms::decomposition_index[uninorms::CHARS >> 8] = { |
|
22843
|
|
|
|
|
|
|
0,1,2,3,4,5,6,7,7,8,9,10,11,12,13,14,15,7,7,7,7,7,7,7,7,7,7,16,7,17,18,19,20,21,22,23,24,7,7,7,7,7,25,7,26,27,28,29,30,31,32,33,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,34,35,7,7,7,36,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,37,38,39,40,41,42,43,7,7,7,7,7,7,7,44,7,7,7,7,7,7,7,7,45,46,7,47,48,49,7,7,7,50,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,51,7,52,53,54,55,56,7,7,7,7,7,7,7,7,57,7,7,7,7,7,7,7,7,7,7,7,7,58,59,7,60,61,62,7,7,7,7,7,7,7,7,63,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,64,65,66,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, |
|
22844
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 |
|
22845
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 |
|
22846
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 |
|
22847
|
|
|
|
|
|
|
7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7 |
|
22848
|
|
|
|
|
|
|
}; |
|
22849
|
|
|
|
|
|
|
const uint16_t uninorms::decomposition_block[][257] = { |
|
22850
|
|
|
|
|
|
|
{4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,5,8,8,8,8,8,8,8,9,16,17,20,20,20,20,21,28,28,29,33,37,45,48,48,49,57,61,64,65,77,89,100,100,108,116,124,132,140,148,148,156,164,172,180,188,196,204,212,220,220,228,236,244,252,260,268,268,268,276,284,292,300,308,308,308,316,324,332,340,348,356,356,364,372,380,388,396,404,412,420,428,428,436,444,452,460,468,476,476,476,484,492,500,508,516,516,524}, |
|
22851
|
|
|
|
|
|
|
{524,532,540,548,556,564,572,580,588,596,604,612,620,628,636,644,652,652,652,660,668,676,684,692,700,708,716,724,732,740,748,756,764,772,780,788,796,804,812,812,812,820,828,836,844,852,860,868,876,884,885,893,900,908,916,924,932,932,940,948,956,964,972,981,989,996,996,996,1004,1012,1020,1028,1036,1045,1052,1052,1052,1060,1068,1076,1084,1092,1100,1100,1100,1108,1116,1124,1132,1140,1148,1156,1164,1172,1180,1188,1196,1204,1212,1220,1228,1236,1244,1244,1244,1252,1260,1268,1276,1284,1292,1300,1308,1316,1324,1332,1340,1348,1356,1364,1372,1380,1388,1396,1404,1412,1420,1429,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1432,1440,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1456,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1464,1465,1477,1489,1501,1509,1517,1525,1533,1541,1548,1556,1564,1572,1580,1588,1596,1604,1612,1624,1636,1648,1660,1672,1684,1696,1708,1708,1720,1732,1744,1756,1764,1772,1772,1772,1780,1788,1796,1804,1812,1820,1832,1844,1852,1860,1869,1877,1885,1892,1900,1908,1908,1908,1916,1924,1936,1948,1956,1964,1972,1980}, |
|
22852
|
|
|
|
|
|
|
{1980,1988,1996,2004,2012,2020,2028,2036,2044,2052,2060,2068,2076,2084,2092,2100,2108,2116,2124,2132,2140,2148,2156,2164,2172,2180,2188,2196,2204,2204,2204,2212,2220,2220,2220,2220,2220,2220,2220,2228,2236,2244,2252,2264,2276,2288,2300,2308,2316,2328,2340,2348,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2356,2357,2361,2365,2369,2373,2377,2381,2385,2389,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2392,2393,2401,2409,2417,2425,2433,2440,2440,2441,2445,2449,2453,2457,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460}, |
|
22853
|
|
|
|
|
|
|
{2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2460,2464,2468,2468,2472,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2480,2484,2484,2484,2484,2484,2485,2492,2492,2492,2492,2496,2496,2496,2496,2496,2497,2506,2512,2520,2524,2532,2540,2548,2548,2556,2556,2564,2572,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2584,2592,2600,2608,2616,2624,2632,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2644,2652,2660,2668,2676,2684,2685,2689,2693,2698,2706,2713,2717,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2720,2721,2725,2729,2732,2733,2737,2740,2740,2740,2741,2744,2744,2744,2744,2744,2744,2744}, |
|
22854
|
|
|
|
|
|
|
{2744,2752,2760,2760,2768,2768,2768,2768,2776,2776,2776,2776,2776,2784,2792,2800,2800,2800,2800,2800,2800,2800,2800,2800,2800,2800,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2808,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2816,2824,2832,2832,2840,2840,2840,2840,2848,2848,2848,2848,2848,2856,2864,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2872,2880,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2888,2896,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2904,2912,2920,2928,2936,2936,2936,2944,2952,2952,2952,2960,2968,2976,2984,2992,3000,3000,3000,3008,3016,3024,3032,3040,3048,3048,3048,3056,3064,3072,3080,3088,3096,3104,3112,3120,3128,3136,3144,3144,3144,3152,3160,3160,3160,3160,3160,3160,3160}, |
|
22855
|
|
|
|
|
|
|
{3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3160,3161,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168}, |
|
22856
|
|
|
|
|
|
|
{3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3168,3176,3184,3192,3200,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3208,3209,3217,3225,3233,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3240,3248,3248,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3256,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264}, |
|
22857
|
|
|
|
|
|
|
{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}, |
|
22858
|
|
|
|
|
|
|
{3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3264,3272,3272,3272,3272,3272,3272,3272,3272,3280,3280,3280,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3288,3296,3304,3312,3320,3328,3336,3344,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3352,3360,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3368,3376,3384,3384,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392}, |
|
22859
|
|
|
|
|
|
|
{3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3392,3400,3400,3400,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3408,3416,3424,3432,3432,3432,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440}, |
|
22860
|
|
|
|
|
|
|
{3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3440,3448,3448,3448,3456,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3464,3472,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3480,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3488,3496,3504,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512}, |
|
22861
|
|
|
|
|
|
|
{3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3512,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3520,3528,3528,3528,3528,3528,3528,3528,3536,3544,3544,3552,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564}, |
|
22862
|
|
|
|
|
|
|
{3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3564,3572,3580,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3588,3596,3596,3604,3616,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624}, |
|
22863
|
|
|
|
|
|
|
{3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3624,3625,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3632,3633,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3640,3641,3649,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656}, |
|
22864
|
|
|
|
|
|
|
{3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3656,3657,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3660,3668,3668,3668,3668,3668,3668,3668,3668,3668,3668,3676,3676,3676,3676,3676,3684,3684,3684,3684,3684,3692,3692,3692,3692,3692,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3700,3708,3708,3708,3708,3708,3708,3708,3708,3708,3708,3716,3716,3724,3733,3744,3753,3764,3764,3764,3764,3764,3764,3764,3764,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3772,3780,3780,3780,3780,3780,3780,3780,3780,3780,3780,3788,3788,3788,3788,3788,3796,3796,3796,3796,3796,3804,3804,3804,3804,3804,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3812,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820}, |
|
22865
|
|
|
|
|
|
|
{3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3820,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3828,3829,3832,3832,3832,3832}, |
|
22866
|
|
|
|
|
|
|
{3832,3832,3832,3832,3832,3832,3832,3840,3840,3848,3848,3856,3856,3864,3864,3872,3872,3872,3872,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3880,3888,3888,3896,3896,3896,3904,3912,3912,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920}, |
|
22867
|
|
|
|
|
|
|
{3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3920,3921,3925,3929,3932,3933,3937,3941,3945,3949,3953,3957,3961,3965,3969,3973,3976,3977,3981,3985,3989,3993,3997,4001,4005,4009,4013,4017,4021,4025,4029,4033,4037,4041,4045,4048,4049,4053,4057,4061,4065,4069,4073,4077,4081,4085,4089,4093,4097,4101,4105,4109,4113,4117,4121,4125,4129,4133,4137,4141,4145,4149,4153,4157,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4160,4161,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4164,4165,4169,4173,4177,4181,4185,4189,4193,4197,4201,4205,4209,4213,4217,4221,4225,4229,4233,4237,4241,4245,4249,4253,4257,4261,4265,4269,4273,4277,4281,4285,4289,4293,4297,4301,4305,4309,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312,4312}, |
|
22868
|
|
|
|
|
|
|
{4312,4320,4328,4336,4344,4352,4360,4368,4376,4388,4400,4408,4416,4424,4432,4440,4448,4456,4464,4472,4480,4492,4504,4516,4528,4536,4544,4552,4560,4572,4584,4592,4600,4608,4616,4624,4632,4640,4648,4656,4664,4672,4680,4688,4696,4704,4712,4724,4736,4744,4752,4760,4768,4776,4784,4792,4800,4812,4824,4832,4840,4848,4856,4864,4872,4880,4888,4896,4904,4912,4920,4928,4936,4944,4952,4960,4968,4980,4992,5004,5016,5028,5040,5052,5064,5072,5080,5088,5096,5104,5112,5120,5128,5140,5152,5160,5168,5176,5184,5192,5200,5212,5224,5236,5248,5260,5272,5280,5288,5296,5304,5312,5320,5328,5336,5344,5352,5360,5368,5376,5384,5396,5408,5420,5432,5440,5448,5456,5464,5472,5480,5488,5496,5504,5512,5520,5528,5536,5544,5552,5560,5568,5576,5584,5592,5600,5608,5616,5624,5632,5640,5648,5656,5664,5673,5682,5688,5688,5688,5688,5688,5696,5704,5712,5720,5732,5744,5756,5768,5780,5792,5804,5816,5828,5840,5852,5864,5876,5888,5900,5912,5924,5936,5948,5960,5968,5976,5984,5992,6000,6008,6020,6032,6044,6056,6068,6080,6092,6104,6116,6128,6136,6144,6152,6160,6168,6176,6184,6192,6204,6216,6228,6240,6252,6264,6276,6288,6300,6312,6324,6336,6348,6360,6372,6384,6396,6408,6420,6432,6440,6448,6456,6464,6476,6488,6500,6512,6524,6536,6548,6560,6572,6584,6592,6600,6608,6616,6624,6632,6640,6648,6648,6648,6648,6648,6648,6648}, |
|
22869
|
|
|
|
|
|
|
{6648,6656,6664,6676,6688,6700,6712,6724,6736,6744,6752,6764,6776,6788,6800,6812,6824,6832,6840,6852,6864,6876,6888,6888,6888,6896,6904,6916,6928,6940,6952,6952,6952,6960,6968,6980,6992,7004,7016,7028,7040,7048,7056,7068,7080,7092,7104,7116,7128,7136,7144,7156,7168,7180,7192,7204,7216,7224,7232,7244,7256,7268,7280,7292,7304,7312,7320,7332,7344,7356,7368,7368,7368,7376,7384,7396,7408,7420,7432,7432,7432,7440,7448,7460,7472,7484,7496,7508,7520,7520,7528,7528,7540,7540,7552,7552,7564,7572,7580,7592,7604,7616,7628,7640,7652,7660,7668,7680,7692,7704,7716,7728,7740,7748,7756,7764,7772,7780,7788,7796,7804,7812,7820,7828,7836,7844,7852,7852,7852,7864,7876,7892,7908,7924,7940,7956,7972,7984,7996,8012,8028,8044,8060,8076,8092,8104,8116,8132,8148,8164,8180,8196,8212,8224,8236,8252,8268,8284,8300,8316,8332,8344,8356,8372,8388,8404,8420,8436,8452,8464,8476,8492,8508,8524,8540,8556,8572,8580,8588,8600,8608,8620,8620,8628,8640,8648,8656,8664,8672,8681,8688,8693,8701,8710,8716,8728,8736,8748,8748,8756,8768,8776,8784,8792,8800,8810,8818,8826,8832,8840,8848,8860,8872,8872,8872,8880,8892,8900,8908,8916,8924,8926,8934,8942,8948,8956,8964,8976,8988,8996,9004,9012,9024,9032,9040,9048,9056,9066,9074,9080,9084,9084,9084,9096,9104,9116,9116,9124,9136,9144,9152,9160,9168,9178,9181,9188,9190}, |
|
22870
|
|
|
|
|
|
|
{9190,9194,9197,9201,9205,9209,9213,9217,9221,9225,9229,9232,9232,9232,9232,9232,9232,9233,9236,9236,9236,9236,9236,9237,9244,9244,9244,9244,9244,9244,9244,9244,9244,9244,9244,9244,9245,9249,9257,9268,9268,9268,9268,9268,9268,9268,9268,9269,9272,9272,9272,9273,9281,9292,9293,9301,9312,9312,9312,9312,9313,9320,9321,9328,9328,9328,9328,9328,9328,9328,9328,9329,9337,9345,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9352,9353,9368,9368,9368,9368,9368,9368,9368,9369,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9372,9373,9377,9380,9380,9381,9385,9389,9393,9397,9401,9405,9409,9413,9417,9421,9425,9429,9433,9437,9441,9445,9449,9453,9457,9461,9465,9469,9473,9477,9481,9485,9488,9489,9493,9497,9501,9505,9509,9513,9517,9521,9525,9529,9533,9537,9540,9540,9540,9540,9540,9540,9540,9540,9540,9540,9540,9541,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9548,9549}, |
|
22871
|
|
|
|
|
|
|
{9549,9561,9573,9577,9584,9585,9597,9609,9612,9613,9621,9625,9629,9633,9637,9641,9645,9649,9653,9657,9660,9661,9665,9672,9672,9673,9677,9681,9685,9689,9692,9692,9693,9701,9713,9720,9721,9724,9724,9728,9729,9732,9732,9736,9745,9749,9752,9753,9757,9761,9764,9765,9769,9773,9777,9781,9785,9789,9792,9793,9805,9809,9813,9817,9821,9824,9824,9824,9824,9825,9829,9833,9837,9841,9844,9844,9844,9844,9844,9844,9845,9857,9869,9885,9897,9909,9921,9933,9945,9957,9969,9981,9993,10005,10017,10029,10037,10041,10049,10061,10069,10073,10081,10093,10109,10117,10121,10129,10141,10145,10149,10153,10157,10161,10169,10181,10189,10193,10201,10213,10229,10237,10241,10249,10261,10265,10269,10273,10276,10276,10276,10276,10276,10276,10276,10276,10276,10277,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10288,10296,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10304,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10312,10320,10328,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336,10336}, |
|
22872
|
|
|
|
|
|
|
{10336,10336,10336,10336,10336,10344,10344,10344,10344,10344,10352,10352,10352,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10360,10368,10368,10376,10376,10376,10376,10376,10377,10385,10396,10397,10405,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10416,10424,10424,10424,10432,10432,10432,10440,10440,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10448,10456,10456,10464,10464,10464,10464,10464,10464,10464,10464,10464,10464,10464,10472,10480,10488,10496,10504,10504,10504,10512,10520,10520,10520,10528,10536,10536,10536,10536,10536,10536,10536,10544,10552,10552,10552,10560,10568,10568,10568,10576,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10584,10592,10600,10608,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10616,10624,10632,10640,10648,10648,10648,10648,10648,10648,10648,10656,10664,10672,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680}, |
|
22873
|
|
|
|
|
|
|
{10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10680,10684,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688}, |
|
22874
|
|
|
|
|
|
|
{10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10688,10689,10693,10697,10701,10705,10709,10713,10717,10721,10725,10733,10741,10749,10757,10765,10773,10781,10789,10797,10805,10813,10825,10837,10849,10861,10873,10885,10897,10909,10921,10937,10953,10969,10985,11001,11017,11033,11049,11065,11081,11097,11105,11113,11121,11129,11137,11145,11153,11161,11169,11181,11193,11205,11217,11229,11241,11253,11265,11277,11289,11301,11313,11325,11337,11349,11361,11373,11385,11397,11409,11421,11433,11445,11457,11469,11481,11493,11505,11517,11529,11541,11553,11565,11577,11589,11601,11613,11617,11621,11625,11629,11633,11637,11641,11645,11649,11653,11657,11661,11665,11669,11673,11677,11681,11685,11689,11693,11697,11701,11705,11709,11713,11717,11721,11725,11729,11733,11737,11741,11745,11749,11753,11757,11761,11765,11769,11773,11777,11781,11785,11789,11793,11797,11801,11805,11809,11813,11817,11821,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824}, |
|
22875
|
|
|
|
|
|
|
{11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11824,11825,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11840,11841,11853,11861,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11872,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880}, |
|
22876
|
|
|
|
|
|
|
{11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11880,11881,11885,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888}, |
|
22877
|
|
|
|
|
|
|
{11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11888,11889,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892}, |
|
22878
|
|
|
|
|
|
|
{11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11892,11893,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11896,11897,11900,11900,11900,11900,11900,11900,11900,11900,11900,11900,11900,11900,11901}, |
|
22879
|
|
|
|
|
|
|
{11901,11905,11909,11913,11917,11921,11925,11929,11933,11937,11941,11945,11949,11953,11957,11961,11965,11969,11973,11977,11981,11985,11989,11993,11997,12001,12005,12009,12013,12017,12021,12025,12029,12033,12037,12041,12045,12049,12053,12057,12061,12065,12069,12073,12077,12081,12085,12089,12093,12097,12101,12105,12109,12113,12117,12121,12125,12129,12133,12137,12141,12145,12149,12153,12157,12161,12165,12169,12173,12177,12181,12185,12189,12193,12197,12201,12205,12209,12213,12217,12221,12225,12229,12233,12237,12241,12245,12249,12253,12257,12261,12265,12269,12273,12277,12281,12285,12289,12293,12297,12301,12305,12309,12313,12317,12321,12325,12329,12333,12337,12341,12345,12349,12353,12357,12361,12365,12369,12373,12377,12381,12385,12389,12393,12397,12401,12405,12409,12413,12417,12421,12425,12429,12433,12437,12441,12445,12449,12453,12457,12461,12465,12469,12473,12477,12481,12485,12489,12493,12497,12501,12505,12509,12513,12517,12521,12525,12529,12533,12537,12541,12545,12549,12553,12557,12561,12565,12569,12573,12577,12581,12585,12589,12593,12597,12601,12605,12609,12613,12617,12621,12625,12629,12633,12637,12641,12645,12649,12653,12657,12661,12665,12669,12673,12677,12681,12685,12689,12693,12697,12701,12705,12709,12713,12717,12721,12725,12729,12733,12737,12741,12745,12749,12753,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12756,12757}, |
|
22880
|
|
|
|
|
|
|
{12757,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12760,12761,12764,12765,12769,12773,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12776,12784,12784,12792,12792,12800,12800,12808,12808,12816,12816,12824,12824,12832,12832,12840,12840,12848,12848,12856,12856,12864,12864,12872,12872,12872,12880,12880,12888,12888,12896,12896,12896,12896,12896,12896,12896,12904,12912,12912,12920,12928,12928,12936,12944,12944,12952,12960,12960,12968,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12976,12984,12984,12984,12984,12984,12984,12985,12993,13000,13000,13009,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13016,13024,13024,13032,13032,13040,13040,13048,13048,13056,13056,13064,13064,13072,13072,13080,13080,13088,13088,13096,13096,13104,13104,13112,13112,13112,13120,13120,13128,13128,13136,13136,13136,13136,13136,13136,13136,13144,13152,13152,13160,13168,13168,13176,13184,13184,13192,13200,13200,13208,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13216,13224,13224,13224,13232,13240,13248,13256,13256,13256,13256,13265,13272}, |
|
22881
|
|
|
|
|
|
|
{13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13272,13273,13277,13281,13285,13289,13293,13297,13301,13305,13309,13313,13317,13321,13325,13329,13333,13337,13341,13345,13349,13353,13357,13361,13365,13369,13373,13377,13381,13385,13389,13393,13397,13401,13405,13409,13413,13417,13421,13425,13429,13433,13437,13441,13445,13449,13453,13457,13461,13465,13469,13473,13477,13481,13485,13489,13493,13497,13501,13505,13509,13513,13517,13521,13525,13529,13533,13537,13541,13545,13549,13553,13557,13561,13565,13569,13573,13577,13581,13585,13589,13593,13597,13601,13605,13609,13613,13617,13621,13625,13629,13633,13637,13641,13645,13648,13648,13648,13649,13653,13657,13661,13665,13669,13673,13677,13681,13685,13689,13693,13697,13701,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13704,13705}, |
|
22882
|
|
|
|
|
|
|
{13705,13717,13729,13741,13753,13765,13777,13789,13801,13813,13825,13837,13849,13861,13873,13889,13905,13921,13937,13953,13969,13985,14001,14017,14033,14049,14065,14081,14097,14113,14141,14164,14165,14177,14189,14201,14213,14225,14237,14249,14261,14273,14285,14297,14309,14321,14333,14345,14357,14369,14381,14393,14405,14417,14429,14441,14453,14465,14477,14489,14501,14513,14525,14537,14549,14561,14573,14585,14597,14601,14605,14609,14612,14612,14612,14612,14612,14612,14612,14612,14613,14625,14633,14641,14649,14657,14665,14673,14681,14689,14697,14705,14713,14721,14729,14737,14745,14749,14753,14757,14761,14765,14769,14773,14777,14781,14785,14789,14793,14797,14801,14809,14817,14825,14833,14841,14849,14857,14865,14873,14881,14889,14897,14905,14913,14933,14949,14956,14957,14961,14965,14969,14973,14977,14981,14985,14989,14993,14997,15001,15005,15009,15013,15017,15021,15025,15029,15033,15037,15041,15045,15049,15053,15057,15061,15065,15069,15073,15077,15081,15085,15089,15093,15097,15101,15105,15109,15113,15117,15121,15125,15129,15133,15137,15141,15145,15149,15153,15161,15169,15177,15185,15193,15201,15209,15217,15225,15233,15241,15249,15257,15265,15273,15281,15289,15297,15305,15313,15321,15329,15337,15345,15357,15369,15381,15389,15401,15409,15421,15425,15429,15433,15437,15441,15445,15449,15453,15457,15461,15465,15469,15473,15477,15481,15485,15489,15493,15497,15501,15505,15509,15513,15517,15521,15525,15529,15533,15537,15541,15545,15549,15553,15557,15561,15565,15569,15573,15577,15581,15585,15589,15593,15597,15601,15605,15609,15617}, |
|
22883
|
|
|
|
|
|
|
{15617,15637,15653,15673,15685,15705,15717,15729,15753,15769,15781,15793,15805,15821,15837,15853,15869,15885,15901,15917,15941,15949,15973,15997,16017,16033,16057,16081,16097,16109,16121,16137,16153,16173,16193,16205,16217,16233,16245,16257,16265,16273,16285,16297,16321,16337,16357,16381,16397,16409,16421,16445,16461,16485,16497,16517,16529,16545,16557,16573,16593,16609,16629,16645,16653,16673,16685,16697,16713,16725,16737,16749,16769,16785,16793,16817,16829,16849,16865,16881,16893,16905,16921,16929,16945,16965,16973,16997,17009,17017,17025,17033,17041,17049,17057,17065,17073,17081,17089,17101,17113,17125,17137,17149,17161,17173,17185,17197,17209,17221,17233,17245,17257,17269,17281,17289,17297,17309,17317,17325,17333,17345,17357,17365,17373,17381,17389,17397,17413,17421,17429,17437,17445,17453,17461,17469,17477,17489,17505,17513,17521,17529,17537,17545,17553,17561,17573,17585,17597,17609,17617,17625,17633,17641,17649,17657,17665,17673,17681,17689,17701,17713,17721,17733,17745,17757,17765,17777,17789,17805,17813,17825,17837,17849,17861,17881,17905,17913,17921,17929,17937,17945,17953,17961,17969,17977,17985,17993,18001,18009,18017,18025,18033,18041,18049,18065,18073,18081,18089,18105,18117,18125,18133,18141,18149,18157,18165,18173,18181,18189,18197,18209,18217,18225,18237,18249,18257,18273,18285,18293,18301,18309,18317,18329,18341,18349,18357,18365,18373,18381,18389,18397,18405,18413,18425,18437,18449,18461,18473,18485,18497,18509,18521,18533,18545,18557,18569,18581,18593,18605,18617,18629,18641,18653,18665,18677,18688}, |
|
22884
|
|
|
|
|
|
|
{18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18688,18689,18693,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696}, |
|
22885
|
|
|
|
|
|
|
{18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18696,18697,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18700,18701,18705,18709,18712,18712,18712,18713,18717,18720,18720,18720,18720,18720,18720,18720}, |
|
22886
|
|
|
|
|
|
|
{18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18720,18721,18725,18729,18733,18736,18736,18736,18736,18736,18736,18736,18736,18736,18737,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740,18740}, |
|
22887
|
|
|
|
|
|
|
{18740,18744,18748,18752,18756,18760,18764,18768,18772,18776,18780,18784,18788,18792,18796,18800,18804,18808,18812,18816,18820,18824,18828,18832,18836,18840,18844,18848,18852,18856,18860,18864,18868,18872,18876,18880,18884,18888,18892,18896,18900,18904,18908,18912,18916,18920,18924,18928,18932,18936,18940,18944,18948,18952,18956,18960,18964,18968,18972,18976,18980,18984,18988,18992,18996,19000,19004,19008,19012,19016,19020,19024,19028,19032,19036,19040,19044,19048,19052,19056,19060,19064,19068,19072,19076,19080,19084,19088,19092,19096,19100,19104,19108,19112,19116,19120,19124,19128,19132,19136,19140,19144,19148,19152,19156,19160,19164,19168,19172,19176,19180,19184,19188,19192,19196,19200,19204,19208,19212,19216,19220,19224,19228,19232,19236,19240,19244,19248,19252,19256,19260,19264,19268,19272,19276,19280,19284,19288,19292,19296,19300,19304,19308,19312,19316,19320,19324,19328,19332,19336,19340,19344,19348,19352,19356,19360,19364,19368,19372,19376,19380,19384,19388,19392,19396,19400,19404,19408,19412,19416,19420,19424,19428,19432,19436,19440,19444,19448,19452,19456,19460,19464,19468,19472,19476,19480,19484,19488,19492,19496,19500,19504,19508,19512,19516,19520,19524,19528,19532,19536,19540,19544,19548,19552,19556,19560,19564,19568,19572,19576,19580,19584,19588,19592,19596,19600,19604,19608,19612,19616,19620,19624,19628,19632,19636,19640,19644,19648,19652,19656,19660,19664,19668,19672,19676,19680,19684,19688,19692,19696,19700,19704,19708,19712,19716,19720,19724,19728,19732,19736,19740,19744,19748,19752,19756,19760,19764}, |
|
22888
|
|
|
|
|
|
|
{19764,19768,19772,19776,19780,19784,19788,19792,19796,19800,19804,19808,19812,19816,19820,19820,19820,19824,19824,19828,19828,19828,19832,19836,19840,19844,19848,19852,19856,19860,19864,19868,19868,19872,19872,19876,19876,19876,19880,19884,19884,19884,19884,19888,19892,19896,19900,19904,19908,19912,19916,19920,19924,19928,19932,19936,19940,19944,19948,19952,19956,19960,19964,19968,19972,19976,19980,19984,19988,19992,19996,20000,20004,20008,20012,20016,20020,20024,20028,20032,20036,20040,20044,20048,20052,20056,20060,20064,20068,20072,20076,20080,20084,20088,20092,20096,20100,20104,20108,20112,20116,20120,20124,20128,20132,20136,20140,20144,20148,20152,20156,20156,20156,20160,20164,20168,20172,20176,20180,20184,20188,20192,20196,20200,20204,20208,20212,20216,20220,20224,20228,20232,20236,20240,20244,20248,20252,20256,20260,20264,20268,20272,20276,20280,20284,20288,20292,20296,20300,20304,20308,20312,20316,20320,20324,20328,20332,20336,20340,20344,20348,20352,20356,20360,20364,20368,20372,20376,20380,20384,20388,20392,20396,20400,20404,20408,20412,20416,20420,20424,20428,20432,20436,20440,20444,20448,20452,20456,20460,20464,20468,20472,20476,20480,20484,20488,20492,20496,20500,20504,20508,20512,20516,20520,20524,20528,20532,20536,20540,20544,20548,20552,20556,20560,20564,20568,20572,20576,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20580,20581}, |
|
22889
|
|
|
|
|
|
|
{20581,20589,20597,20605,20617,20629,20637,20644,20644,20644,20644,20644,20644,20644,20644,20644,20644,20644,20644,20645,20653,20661,20669,20677,20684,20684,20684,20684,20684,20684,20692,20692,20701,20705,20709,20713,20717,20721,20725,20729,20733,20737,20740,20748,20756,20768,20780,20788,20796,20804,20812,20820,20828,20836,20844,20852,20852,20860,20868,20876,20884,20892,20892,20900,20900,20908,20916,20916,20924,20932,20932,20940,20948,20956,20964,20972,20980,20988,20996,21005,21013,21017,21021,21025,21029,21033,21037,21041,21045,21049,21053,21057,21061,21065,21069,21073,21077,21081,21085,21089,21093,21097,21101,21105,21109,21113,21117,21121,21125,21129,21133,21137,21141,21145,21149,21153,21157,21161,21165,21169,21173,21177,21181,21185,21189,21193,21197,21201,21205,21209,21213,21217,21221,21225,21229,21233,21237,21241,21245,21249,21253,21257,21261,21265,21269,21273,21277,21281,21285,21289,21293,21297,21301,21305,21309,21313,21317,21321,21325,21329,21333,21337,21341,21345,21349,21357,21365,21369,21373,21377,21381,21385,21389,21393,21397,21401,21405,21413,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21420,21421,21425,21429,21433,21437,21441,21445,21449,21453,21457,21461,21469,21473,21477,21481,21485,21489,21493,21497,21501,21505,21509,21513,21517,21529,21541,21553,21565,21577,21589,21601,21613,21625,21637,21649,21661,21673,21685,21697,21709,21721,21733,21737,21741,21745,21749}, |
|
22890
|
|
|
|
|
|
|
{21749,21761,21773,21785,21797,21809,21817,21825,21833,21841,21849,21857,21865,21873,21881,21889,21897,21905,21913,21921,21929,21937,21945,21953,21961,21969,21977,21985,21993,22001,22009,22017,22025,22033,22041,22049,22057,22065,22073,22081,22089,22097,22105,22113,22121,22129,22137,22145,22153,22161,22169,22177,22185,22193,22201,22209,22217,22225,22233,22241,22249,22257,22265,22273,22281,22289,22297,22305,22313,22321,22329,22337,22345,22353,22361,22369,22377,22385,22393,22401,22409,22417,22425,22433,22441,22449,22457,22465,22473,22481,22489,22497,22505,22513,22521,22533,22545,22557,22569,22581,22593,22605,22617,22629,22641,22653,22665,22673,22681,22689,22697,22705,22713,22721,22729,22737,22745,22753,22761,22769,22777,22785,22793,22801,22809,22817,22825,22833,22841,22849,22857,22865,22873,22881,22889,22897,22905,22913,22921,22929,22937,22945,22953,22961,22969,22977,22985,22993,23001,23009,23017,23025,23037,23049,23061,23073,23085,23093,23101,23109,23117,23125,23133,23141,23149,23157,23165,23173,23181,23189,23197,23205,23213,23221,23229,23237,23245,23253,23261,23269,23277,23285,23293,23301,23309,23317,23325,23333,23341,23349,23357,23365,23373,23381,23389,23397,23405,23413,23421,23429,23437,23445,23453,23461,23469,23477,23485,23493,23501,23509,23517,23525,23533,23541,23549,23557,23565,23573,23581,23589,23597,23605,23613,23621,23633,23645,23653,23661,23669,23677,23685,23693,23701,23709,23717,23725,23733,23741,23749,23757,23765,23773,23781,23793,23805,23817,23825,23833,23841,23849,23857,23865,23873,23881,23889,23897,23905}, |
|
22891
|
|
|
|
|
|
|
{23905,23913,23921,23929,23937,23945,23953,23961,23969,23977,23985,23993,24001,24009,24017,24025,24033,24041,24049,24057,24065,24073,24081,24089,24097,24105,24113,24121,24129,24137,24145,24153,24161,24169,24177,24185,24193,24201,24209,24217,24225,24233,24241,24249,24257,24265,24273,24281,24289,24297,24305,24313,24321,24329,24337,24345,24353,24361,24369,24377,24385,24393,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24400,24401,24413,24425,24437,24449,24461,24473,24485,24497,24509,24521,24533,24545,24557,24569,24581,24593,24605,24617,24629,24641,24653,24665,24677,24689,24701,24713,24725,24737,24749,24761,24773,24785,24797,24809,24821,24833,24845,24857,24869,24881,24893,24905,24917,24929,24941,24953,24965,24977,24989,25001,25013,25025,25037,25049,25061,25073,25085,25097,25109,25121,25133,25145,25157,25168,25168,25169,25181,25193,25205,25217,25229,25241,25253,25265,25277,25289,25301,25313,25325,25337,25349,25361,25373,25385,25397,25409,25421,25433,25445,25457,25469,25481,25493,25505,25517,25529,25541,25553,25565,25577,25589,25601,25613,25625,25637,25649,25661,25673,25685,25697,25709,25721,25733,25745,25757,25769,25781,25793,25805,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25816,25817,25829,25841,25857,25873,25889,25905,25921,25937,25953,25965,26037,26069,26084,26084,26084,26084}, |
|
22892
|
|
|
|
|
|
|
{26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26084,26085,26089,26093,26097,26101,26105,26109,26113,26117,26121,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26132,26133,26141,26145,26149,26153,26157,26161,26165,26169,26173,26177,26181,26185,26189,26193,26197,26201,26205,26209,26213,26217,26220,26220,26221,26225,26229,26237,26245,26253,26261,26265,26269,26273,26277,26281,26284,26285,26289,26293,26297,26301,26305,26309,26313,26317,26321,26325,26329,26333,26337,26341,26345,26349,26353,26357,26360,26361,26365,26369,26373,26376,26376,26376,26376,26377,26385,26393,26400,26401,26408,26409,26417,26425,26433,26441,26449,26457,26465,26473,26481,26489,26493,26501,26509,26517,26525,26533,26541,26549,26557,26565,26573,26581,26589,26593,26597,26601,26605,26609,26613,26617,26621,26625,26629,26633,26637,26641,26645,26649,26653,26657,26661,26665,26669,26673,26677,26681,26685,26689,26693,26697,26701,26705,26709,26713,26717,26721,26725,26729,26733,26737,26741,26745,26749,26753,26757,26761,26765,26769,26773,26777,26781,26785,26789,26793,26797,26801,26805,26809,26813,26817,26821,26825,26829,26833,26837,26841,26845,26849,26853,26857,26861,26865,26869,26873,26877,26881,26885,26889,26893,26897,26901,26905,26909,26913,26917,26921,26925,26929,26933,26937,26941,26945,26949,26953,26957,26961,26965,26969,26973,26977,26981,26985,26989,26993,26997,27001,27005,27017,27029,27041,27053,27065,27077,27085,27092,27092,27092,27092}, |
|
22893
|
|
|
|
|
|
|
{27092,27093,27097,27101,27105,27109,27113,27117,27121,27125,27129,27133,27137,27141,27145,27149,27153,27157,27161,27165,27169,27173,27177,27181,27185,27189,27193,27197,27201,27205,27209,27213,27217,27221,27225,27229,27233,27237,27241,27245,27249,27253,27257,27261,27265,27269,27273,27277,27281,27285,27289,27293,27297,27301,27305,27309,27313,27317,27321,27325,27329,27333,27337,27341,27345,27349,27353,27357,27361,27365,27369,27373,27377,27381,27385,27389,27393,27397,27401,27405,27409,27413,27417,27421,27425,27429,27433,27437,27441,27445,27449,27453,27457,27461,27465,27469,27473,27477,27481,27485,27489,27493,27497,27501,27505,27509,27513,27517,27521,27525,27529,27533,27537,27541,27545,27549,27553,27557,27561,27565,27569,27573,27577,27581,27585,27589,27593,27597,27601,27605,27609,27613,27617,27621,27625,27629,27633,27637,27641,27645,27649,27653,27657,27661,27665,27669,27673,27677,27681,27685,27689,27693,27697,27701,27705,27709,27713,27717,27721,27725,27729,27733,27737,27741,27745,27749,27753,27757,27761,27765,27769,27773,27777,27781,27785,27789,27793,27797,27801,27805,27809,27813,27817,27821,27825,27829,27833,27837,27841,27845,27849,27852,27852,27852,27853,27857,27861,27865,27869,27873,27876,27876,27877,27881,27885,27889,27893,27897,27900,27900,27901,27905,27909,27913,27917,27921,27924,27924,27925,27929,27933,27936,27936,27936,27937,27941,27945,27949,27957,27961,27965,27968,27969,27973,27977,27981,27985,27989,27993,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996}, |
|
22894
|
|
|
|
|
|
|
{27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27996,27997,28001,28005,28009,28013,28016,28017,28021,28025,28029,28033,28037,28041,28045,28049,28053,28057,28061,28065,28069,28073,28077,28081,28085,28089,28093,28097,28101,28105,28109,28113,28117,28121,28125,28129,28133,28137,28141,28145,28149,28153,28157,28161,28165,28169,28173,28177,28181,28184,28185,28189,28193,28197,28201,28205,28209,28213,28217,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220}, |
|
22895
|
|
|
|
|
|
|
{28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28220,28228,28228,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28236,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244}, |
|
22896
|
|
|
|
|
|
|
{28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28244,28252,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260}, |
|
22897
|
|
|
|
|
|
|
{28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28260,28268,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276}, |
|
22898
|
|
|
|
|
|
|
{28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28276,28284,28292,28292,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300}, |
|
22899
|
|
|
|
|
|
|
{28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28300,28308,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316}, |
|
22900
|
|
|
|
|
|
|
{28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28316,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324}, |
|
22901
|
|
|
|
|
|
|
{28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28324,28332,28340,28352,28364,28376,28388,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28400,28408,28416,28428,28440,28452,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464}, |
|
22902
|
|
|
|
|
|
|
{28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28464,28465}, |
|
22903
|
|
|
|
|
|
|
{28465,28469,28473,28477,28481,28485,28489,28493,28497,28501,28505,28509,28513,28517,28521,28525,28529,28533,28537,28541,28545,28549,28553,28557,28561,28565,28569,28573,28577,28581,28585,28589,28593,28597,28601,28605,28609,28613,28617,28621,28625,28629,28633,28637,28641,28645,28649,28653,28657,28661,28665,28669,28673,28677,28681,28685,28689,28693,28697,28701,28705,28709,28713,28717,28721,28725,28729,28733,28737,28741,28745,28749,28753,28757,28761,28765,28769,28773,28777,28781,28785,28789,28793,28797,28801,28804,28805,28809,28813,28817,28821,28825,28829,28833,28837,28841,28845,28849,28853,28857,28861,28865,28869,28873,28877,28881,28885,28889,28893,28897,28901,28905,28909,28913,28917,28921,28925,28929,28933,28937,28941,28945,28949,28953,28957,28961,28965,28969,28973,28977,28981,28985,28989,28993,28997,29001,29005,29009,29013,29017,29021,29025,29029,29033,29037,29041,29045,29049,29053,29057,29061,29065,29069,29073,29077,29081,29085,29088,29089,29093,29096,29096,29097,29100,29100,29101,29105,29108,29108,29109,29113,29117,29121,29124,29125,29129,29133,29137,29141,29145,29149,29153,29157,29161,29165,29169,29172,29173,29176,29177,29181,29185,29189,29193,29197,29201,29204,29205,29209,29213,29217,29221,29225,29229,29233,29237,29241,29245,29249,29253,29257,29261,29265,29269,29273,29277,29281,29285,29289,29293,29297,29301,29305,29309,29313,29317,29321,29325,29329,29333,29337,29341,29345,29349,29353,29357,29361,29365,29369,29373,29377,29381,29385,29389,29393,29397,29401,29405,29409,29413,29417,29421,29425,29429,29433,29437,29441}, |
|
22904
|
|
|
|
|
|
|
{29441,29445,29449,29453,29457,29461,29464,29465,29469,29473,29477,29480,29480,29481,29485,29489,29493,29497,29501,29505,29509,29512,29513,29517,29521,29525,29529,29533,29537,29540,29541,29545,29549,29553,29557,29561,29565,29569,29573,29577,29581,29585,29589,29593,29597,29601,29605,29609,29613,29617,29621,29625,29629,29633,29637,29641,29645,29649,29652,29653,29657,29661,29665,29668,29669,29673,29677,29681,29685,29688,29689,29692,29692,29692,29693,29697,29701,29705,29709,29713,29717,29720,29721,29725,29729,29733,29737,29741,29745,29749,29753,29757,29761,29765,29769,29773,29777,29781,29785,29789,29793,29797,29801,29805,29809,29813,29817,29821,29825,29829,29833,29837,29841,29845,29849,29853,29857,29861,29865,29869,29873,29877,29881,29885,29889,29893,29897,29901,29905,29909,29913,29917,29921,29925,29929,29933,29937,29941,29945,29949,29953,29957,29961,29965,29969,29973,29977,29981,29985,29989,29993,29997,30001,30005,30009,30013,30017,30021,30025,30029,30033,30037,30041,30045,30049,30053,30057,30061,30065,30069,30073,30077,30081,30085,30089,30093,30097,30101,30105,30109,30113,30117,30121,30125,30129,30133,30137,30141,30145,30149,30153,30157,30161,30165,30169,30173,30177,30181,30185,30189,30193,30197,30201,30205,30209,30213,30217,30221,30225,30229,30233,30237,30241,30245,30249,30253,30257,30261,30265,30269,30273,30277,30281,30285,30289,30293,30297,30301,30305,30309,30313,30317,30321,30325,30329,30333,30337,30341,30345,30349,30353,30357,30361,30365,30369,30373,30377,30381,30385,30389,30393,30397,30401,30405,30409,30413,30417}, |
|
22905
|
|
|
|
|
|
|
{30417,30421,30425,30429,30433,30437,30441,30445,30449,30453,30457,30461,30465,30469,30473,30477,30481,30485,30489,30493,30497,30501,30505,30509,30513,30517,30521,30525,30529,30533,30537,30541,30545,30549,30553,30557,30561,30565,30569,30573,30577,30581,30585,30589,30593,30597,30601,30605,30609,30613,30617,30621,30625,30629,30633,30637,30641,30645,30649,30653,30657,30661,30665,30669,30673,30677,30681,30685,30689,30693,30697,30701,30705,30709,30713,30717,30721,30725,30729,30733,30737,30741,30745,30749,30753,30757,30761,30765,30769,30773,30777,30781,30785,30789,30793,30797,30801,30805,30809,30813,30817,30821,30825,30829,30833,30837,30841,30845,30849,30853,30857,30861,30865,30869,30873,30877,30881,30885,30889,30893,30897,30901,30905,30909,30913,30917,30921,30925,30929,30933,30937,30941,30945,30949,30953,30957,30961,30965,30969,30973,30977,30981,30985,30989,30993,30997,31001,31005,31009,31013,31017,31021,31025,31029,31033,31037,31041,31045,31049,31053,31057,31061,31065,31069,31073,31077,31080,31080,31081,31085,31089,31093,31097,31101,31105,31109,31113,31117,31121,31125,31129,31133,31137,31141,31145,31149,31153,31157,31161,31165,31169,31173,31177,31181,31185,31189,31193,31197,31201,31205,31209,31213,31217,31221,31225,31229,31233,31237,31241,31245,31249,31253,31257,31261,31265,31269,31273,31277,31281,31285,31289,31293,31297,31301,31305,31309,31313,31317,31321,31325,31329,31333,31337,31341,31345,31349,31353,31357,31361,31365,31369,31373,31377,31381,31385,31389,31393,31397,31401,31405,31409,31413,31417,31421,31425,31429,31433}, |
|
22906
|
|
|
|
|
|
|
{31433,31437,31441,31445,31449,31453,31457,31461,31465,31469,31473,31477,31481,31485,31489,31493,31497,31501,31505,31509,31513,31517,31521,31525,31529,31533,31537,31541,31545,31549,31553,31557,31561,31565,31569,31573,31577,31581,31585,31589,31593,31597,31601,31605,31609,31613,31617,31621,31625,31629,31633,31637,31641,31645,31649,31653,31657,31661,31665,31669,31673,31677,31681,31685,31689,31693,31697,31701,31705,31709,31713,31717,31721,31725,31729,31733,31737,31741,31745,31749,31753,31757,31761,31765,31769,31773,31777,31781,31785,31789,31793,31797,31801,31805,31809,31813,31817,31821,31825,31829,31833,31837,31841,31845,31849,31853,31857,31861,31865,31869,31873,31877,31881,31885,31889,31893,31897,31901,31905,31909,31913,31917,31921,31925,31929,31933,31937,31941,31945,31949,31953,31957,31961,31965,31969,31973,31977,31981,31985,31989,31993,31997,32001,32005,32009,32013,32017,32021,32025,32029,32033,32037,32041,32045,32049,32053,32057,32061,32065,32069,32073,32077,32081,32085,32089,32093,32097,32101,32105,32109,32113,32117,32121,32125,32129,32133,32137,32141,32145,32149,32153,32157,32161,32165,32169,32173,32177,32181,32185,32189,32193,32197,32201,32205,32209,32213,32217,32221,32225,32229,32233,32237,32241,32245,32248,32248,32249,32253,32257,32261,32265,32269,32273,32277,32281,32285,32289,32293,32297,32301,32305,32309,32313,32317,32321,32325,32329,32333,32337,32341,32345,32349,32353,32357,32361,32365,32369,32373,32377,32381,32385,32389,32393,32397,32401,32405,32409,32413,32417,32421,32425,32429,32433,32437,32441,32445,32448}, |
|
22907
|
|
|
|
|
|
|
{32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32448,32449,32453,32457,32461,32465,32469,32473,32477,32481,32485,32489,32493,32497,32501,32505,32509,32513,32517,32521,32525,32529,32533,32537,32541,32545,32549,32553,32557,32561,32565,32569,32573,32577,32581,32585,32589,32593,32597,32601,32605,32609,32613,32617,32621,32625,32629,32633,32637,32641,32645,32649,32653,32657,32661,32665,32669,32673,32677,32681,32685,32689,32693,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696}, |
|
22908
|
|
|
|
|
|
|
{32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32696,32697}, |
|
22909
|
|
|
|
|
|
|
{32697,32701,32705,32709,32712,32713,32717,32721,32725,32729,32733,32737,32741,32745,32749,32753,32757,32761,32765,32769,32773,32777,32781,32785,32789,32793,32797,32801,32805,32809,32813,32817,32820,32821,32825,32828,32829,32832,32832,32833,32836,32837,32841,32845,32849,32853,32857,32861,32865,32869,32873,32876,32877,32881,32885,32889,32892,32893,32896,32897,32900,32900,32900,32900,32900,32900,32901,32904,32904,32904,32904,32905,32908,32909,32912,32913,32916,32917,32921,32925,32928,32929,32933,32936,32937,32940,32940,32941,32944,32945,32948,32949,32952,32953,32956,32957,32960,32961,32965,32968,32969,32972,32972,32973,32977,32981,32985,32988,32989,32993,32997,33001,33005,33009,33013,33016,33017,33021,33025,33029,33032,33033,33037,33041,33045,33048,33049,33052,33053,33057,33061,33065,33069,33073,33077,33081,33085,33089,33092,33093,33097,33101,33105,33109,33113,33117,33121,33125,33129,33133,33137,33141,33145,33149,33153,33157,33160,33160,33160,33160,33160,33161,33165,33169,33172,33173,33177,33181,33185,33189,33192,33193,33197,33201,33205,33209,33213,33217,33221,33225,33229,33233,33237,33241,33245,33249,33253,33257,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260}, |
|
22910
|
|
|
|
|
|
|
{33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33260,33261}, |
|
22911
|
|
|
|
|
|
|
{33261,33269,33277,33285,33293,33301,33309,33317,33325,33333,33341,33348,33348,33348,33348,33348,33349,33361,33373,33385,33397,33409,33421,33433,33445,33457,33469,33481,33493,33505,33517,33529,33541,33553,33565,33577,33589,33601,33613,33625,33637,33649,33661,33673,33677,33681,33689,33696,33697,33701,33705,33709,33713,33717,33721,33725,33729,33733,33737,33741,33745,33749,33753,33757,33761,33765,33769,33773,33777,33781,33785,33789,33793,33797,33801,33809,33817,33825,33833,33845,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33852,33853,33861,33869,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33876,33877,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33884,33885}, |
|
22912
|
|
|
|
|
|
|
{33885,33893,33901,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33904,33905,33909,33913,33917,33925,33929,33933,33937,33941,33945,33949,33953,33957,33961,33965,33969,33973,33977,33981,33985,33989,33993,33997,34001,34005,34009,34013,34017,34021,34025,34029,34033,34037,34041,34045,34049,34053,34057,34061,34065,34069,34073,34077,34081,34084,34084,34084,34084,34085,34097,34109,34121,34133,34145,34157,34169,34181,34192,34192,34192,34192,34192,34192,34192,34193,34197,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200}, |
|
22913
|
|
|
|
|
|
|
{34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34200,34201,34205,34209,34213,34217,34221,34225,34229,34233,34237,34240,34240,34240,34240,34240,34240,34240}, |
|
22914
|
|
|
|
|
|
|
{34240,34244,34248,34252,34256,34260,34264,34268,34272,34276,34280,34284,34288,34292,34296,34300,34304,34308,34312,34316,34320,34324,34328,34332,34336,34340,34344,34348,34352,34356,34360,34364,34368,34372,34376,34380,34384,34388,34392,34396,34400,34404,34408,34412,34416,34420,34424,34428,34432,34436,34440,34444,34448,34452,34456,34460,34464,34468,34472,34476,34480,34484,34488,34492,34496,34500,34504,34508,34512,34516,34520,34524,34528,34532,34536,34540,34544,34548,34552,34556,34560,34564,34568,34572,34576,34580,34584,34588,34592,34596,34600,34604,34608,34612,34616,34620,34624,34628,34632,34636,34640,34644,34648,34652,34656,34660,34664,34668,34672,34676,34680,34684,34688,34692,34696,34700,34704,34708,34712,34716,34720,34724,34728,34732,34736,34740,34744,34748,34752,34756,34760,34764,34768,34772,34776,34780,34784,34788,34792,34796,34800,34804,34808,34812,34816,34820,34824,34828,34832,34836,34840,34844,34848,34852,34856,34860,34864,34868,34872,34876,34880,34884,34888,34892,34896,34900,34904,34908,34912,34916,34920,34924,34928,34932,34936,34940,34944,34948,34952,34956,34960,34964,34968,34972,34976,34980,34984,34988,34992,34996,35000,35004,35008,35012,35016,35020,35024,35028,35032,35036,35040,35044,35048,35052,35056,35060,35064,35068,35072,35076,35080,35084,35088,35092,35096,35100,35104,35108,35112,35116,35120,35124,35128,35132,35136,35140,35144,35148,35152,35156,35160,35164,35168,35172,35176,35180,35184,35188,35192,35196,35200,35204,35208,35212,35216,35220,35224,35228,35232,35236,35240,35244,35248,35252,35256,35260,35264}, |
|
22915
|
|
|
|
|
|
|
{35264,35268,35272,35276,35280,35284,35288,35292,35296,35300,35304,35308,35312,35316,35320,35324,35328,35332,35336,35340,35344,35348,35352,35356,35360,35364,35368,35372,35376,35380,35384,35388,35392,35396,35400,35404,35408,35412,35416,35420,35424,35428,35432,35436,35440,35444,35448,35452,35456,35460,35464,35468,35472,35476,35480,35484,35488,35492,35496,35500,35504,35508,35512,35516,35520,35524,35528,35532,35536,35540,35544,35548,35552,35556,35560,35564,35568,35572,35576,35580,35584,35588,35592,35596,35600,35604,35608,35612,35616,35620,35624,35628,35632,35636,35640,35644,35648,35652,35656,35660,35664,35668,35672,35676,35680,35684,35688,35692,35696,35700,35704,35708,35712,35716,35720,35724,35728,35732,35736,35740,35744,35748,35752,35756,35760,35764,35768,35772,35776,35780,35784,35788,35792,35796,35800,35804,35808,35812,35816,35820,35824,35828,35832,35836,35840,35844,35848,35852,35856,35860,35864,35868,35872,35876,35880,35884,35888,35892,35896,35900,35904,35908,35912,35916,35920,35924,35928,35932,35936,35940,35944,35948,35952,35956,35960,35964,35968,35972,35976,35980,35984,35988,35992,35996,36000,36004,36008,36012,36016,36020,36024,36028,36032,36036,36040,36044,36048,36052,36056,36060,36064,36068,36072,36076,36080,36084,36088,36092,36096,36100,36104,36108,36112,36116,36120,36124,36128,36132,36136,36140,36144,36148,36152,36156,36160,36164,36168,36172,36176,36180,36184,36188,36192,36196,36200,36204,36208,36212,36216,36220,36224,36228,36232,36236,36240,36244,36248,36252,36256,36260,36264,36268,36272,36276,36280,36284,36288}, |
|
22916
|
|
|
|
|
|
|
{36288,36292,36296,36300,36304,36308,36312,36316,36320,36324,36328,36332,36336,36340,36344,36348,36352,36356,36360,36364,36368,36372,36376,36380,36384,36388,36392,36396,36400,36404,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408,36408} |
|
22917
|
|
|
|
|
|
|
}; |
|
22918
|
|
|
|
|
|
|
const char32_t uninorms::decomposition_data[] = { |
|
22919
|
|
|
|
|
|
|
0,32,32,776,97,32,772,50,51,32,769,956,32,807,49,111,49,8260,52,49,8260,50,51,8260,52,65,768,65,769,65,770,65,771,65,776,65,778,67,807,69,768,69,769,69,770,69,776,73,768,73,769,73,770,73,776,78,771,79,768,79,769,79,770,79,771,79,776,85,768,85,769,85,770,85,776,89,769,97,768,97,769,97,770,97,771,97,776,97,778,99,807,101,768,101,769,101,770,101,776,105,768,105,769,105,770,105,776,110,771,111,768,111,769,111,770,111,771,111,776,117,768,117,769,117,770,117,776,121,769,121,776,65,772,97,772,65,774,97,774,65,808,97,808,67,769,99,769,67,770,99,770,67,775,99,775,67,780,99,780,68,780,100,780,69,772,101,772,69,774,101,774,69,775,101,775,69,808,101,808,69,780,101,780,71,770,103,770,71,774,103,774,71,775,103,775,71,807,103,807,72,770,104,770,73,771,105,771,73,772,105,772,73,774,105,774,73,808,105,808,73,775,73,74,105,106,74,770,106,770,75,807,107,807,76,769,108,769,76,807,108,807,76,780,108,780,76,183,108,183,78,769,110,769,78,807,110,807,78,780,110,780,700,110,79,772,111,772,79,774,111,774,79,779,111,779,82,769,114,769,82,807,114,807,82,780,114,780,83,769,115,769,83,770,115,770,83,807,115,807,83,780,115,780,84,807,116,807,84,780,116,780,85,771,117,771,85,772,117,772,85,774,117,774,85,778,117,778,85,779,117,779,85,808,117,808,87,770,119,770,89,770,121,770,89,776,90,769,122,769,90,775,122,775,90,780,122,780,115,79,795,111,795,85,795,117,795,68,90,780,68,122,780,100,122,780,76,74,76,106,108,106,78,74,78,106,110,106,65,780,97,780,73,780,105,780,79,780,111,780,85,780,117,780,85,776,772,117,776,772,85,776,769,117,776,769,85,776,780,117,776,780,85,776,768,117,776,768,65,776,772,97,776,772,65,775,772,97,775,772,198,772,230,772,71,780,103,780,75,780,107,780,79,808,111,808,79,808,772,111,808,772,439,780,658,780,106,780,68,90,68,122,100,122,71,769,103,769,78,768,110,768,65,778,769,97,778,769,198,769,230,769,216,769,248,769,65,783,97,783,65,785,97,785,69,783,101,783,69,785,101,785,73,783,105,783,73,785,105,785,79,783,111,783,79,785,111,785,82,783,114,783,82,785,114,785,85,783,117,783, |
|
22920
|
|
|
|
|
|
|
85,785,117,785,83,806,115,806,84,806,116,806,72,780,104,780,65,775,97,775,69,807,101,807,79,776,772,111,776,772,79,771,772,111,771,772,79,775,111,775,79,775,772,111,775,772,89,772,121,772,104,614,106,114,633,635,641,119,121,32,774,32,775,32,778,32,808,32,771,32,779,611,108,115,120,661,768,769,787,776,769,697,32,837,59,32,769,168,769,913,769,183,917,769,919,769,921,769,927,769,933,769,937,769,953,776,769,921,776,933,776,945,769,949,769,951,769,953,769,965,776,769,953,776,965,776,959,769,965,769,969,769,946,952,933,978,769,978,776,966,960,954,961,962,920,949,931,1045,768,1045,776,1043,769,1030,776,1050,769,1048,768,1059,774,1048,774,1080,774,1077,768,1077,776,1075,769,1110,776,1082,769,1080,768,1091,774,1140,783,1141,783,1046,774,1078,774,1040,774,1072,774,1040,776,1072,776,1045,774,1077,774,1240,776,1241,776,1046,776,1078,776,1047,776,1079,776,1048,772,1080,772,1048,776,1080,776,1054,776,1086,776,1256,776,1257,776,1069,776,1101,776,1059,772,1091,772,1059,776,1091,776,1059,779,1091,779,1063,776,1095,776,1067,776,1099,776,1381,1410,1575,1619,1575,1620,1608,1620,1575,1621,1610,1620,1575,1652,1608,1652,1735,1652,1610,1652,1749,1620,1729,1620,1746,1620,2344,2364,2352,2364,2355,2364,2325,2364,2326,2364,2327,2364,2332,2364,2337,2364,2338,2364,2347,2364,2351,2364,2503,2494,2503,2519,2465,2492,2466,2492,2479,2492,2610,2620,2616,2620,2582,2620,2583,2620,2588,2620,2603,2620,2887,2902,2887,2878,2887,2903,2849,2876,2850,2876,2962,3031,3014,3006,3015,3006,3014,3031,3142,3158,3263,3285,3270,3285,3270,3286,3270,3266,3270,3266,3285,3398,3390,3399,3390,3398,3415,3545,3530,3545,3535,3545,3535,3530,3545,3551,3661,3634,3789,3762,3755,3737,3755,3745,3851,3906,4023,3916,4023,3921,4023,3926,4023,3931,4023,3904,4021,3953,3954,3953,3956,4018,3968,4018,3953,3968,4019,3968,4019,3953,3968,3953,3968,3986,4023,3996,4023,4001,4023,4006,4023,4011,4023,3984,4021,4133,4142,4316,6917,6965,6919,6965,6921,6965,6923,6965,6925,6965,6929,6965,6970,6965,6972,6965,6974,6965,6975,6965,6978,6965,65,198,66,68, |
|
22921
|
|
|
|
|
|
|
69,398,71,72,73,74,75,76,77,78,79,546,80,82,84,85,87,97,592,593,7426,98,100,101,601,603,604,103,107,109,331,111,596,7446,7447,112,116,117,7453,623,118,7461,946,947,948,966,967,105,114,117,118,946,947,961,966,967,1085,594,99,597,240,604,102,607,609,613,616,617,618,7547,669,621,7557,671,625,624,626,627,628,629,632,642,643,427,649,650,7452,651,652,122,656,657,658,952,65,805,97,805,66,775,98,775,66,803,98,803,66,817,98,817,67,807,769,99,807,769,68,775,100,775,68,803,100,803,68,817,100,817,68,807,100,807,68,813,100,813,69,772,768,101,772,768,69,772,769,101,772,769,69,813,101,813,69,816,101,816,69,807,774,101,807,774,70,775,102,775,71,772,103,772,72,775,104,775,72,803,104,803,72,776,104,776,72,807,104,807,72,814,104,814,73,816,105,816,73,776,769,105,776,769,75,769,107,769,75,803,107,803,75,817,107,817,76,803,108,803,76,803,772,108,803,772,76,817,108,817,76,813,108,813,77,769,109,769,77,775,109,775,77,803,109,803,78,775,110,775,78,803,110,803,78,817,110,817,78,813,110,813,79,771,769,111,771,769,79,771,776,111,771,776,79,772,768,111,772,768,79,772,769,111,772,769,80,769,112,769,80,775,112,775,82,775,114,775,82,803,114,803,82,803,772,114,803,772,82,817,114,817,83,775,115,775,83,803,115,803,83,769,775,115,769,775,83,780,775,115,780,775,83,803,775,115,803,775,84,775,116,775,84,803,116,803,84,817,116,817,84,813,116,813,85,804,117,804,85,816,117,816,85,813,117,813,85,771,769,117,771,769,85,772,776,117,772,776,86,771,118,771,86,803,118,803,87,768,119,768,87,769,119,769,87,776,119,776,87,775,119,775,87,803,119,803,88,775,120,775,88,776,120,776,89,775,121,775,90,770,122,770,90,803,122,803,90,817,122,817,104,817,116,776,119,778,121,778,97,702,383,775,65,803,97,803,65,777,97,777,65,770,769,97,770,769,65,770,768,97,770,768,65,770,777,97,770,777,65,770,771,97,770,771,65,803,770,97,803,770,65,774,769,97,774,769,65,774,768,97,774,768,65,774,777,97,774,777,65,774,771,97,774,771,65,803,774,97,803,774,69,803,101,803,69,777,101,777,69,771,101,771,69,770,769,101,770,769,69,770,768,101,770, |
|
22922
|
|
|
|
|
|
|
768,69,770,777,101,770,777,69,770,771,101,770,771,69,803,770,101,803,770,73,777,105,777,73,803,105,803,79,803,111,803,79,777,111,777,79,770,769,111,770,769,79,770,768,111,770,768,79,770,777,111,770,777,79,770,771,111,770,771,79,803,770,111,803,770,79,795,769,111,795,769,79,795,768,111,795,768,79,795,777,111,795,777,79,795,771,111,795,771,79,795,803,111,795,803,85,803,117,803,85,777,117,777,85,795,769,117,795,769,85,795,768,117,795,768,85,795,777,117,795,777,85,795,771,117,795,771,85,795,803,117,795,803,89,768,121,768,89,803,121,803,89,777,121,777,89,771,121,771,945,787,945,788,945,787,768,945,788,768,945,787,769,945,788,769,945,787,834,945,788,834,913,787,913,788,913,787,768,913,788,768,913,787,769,913,788,769,913,787,834,913,788,834,949,787,949,788,949,787,768,949,788,768,949,787,769,949,788,769,917,787,917,788,917,787,768,917,788,768,917,787,769,917,788,769,951,787,951,788,951,787,768,951,788,768,951,787,769,951,788,769,951,787,834,951,788,834,919,787,919,788,919,787,768,919,788,768,919,787,769,919,788,769,919,787,834,919,788,834,953,787,953,788,953,787,768,953,788,768,953,787,769,953,788,769,953,787,834,953,788,834,921,787,921,788,921,787,768,921,788,768,921,787,769,921,788,769,921,787,834,921,788,834,959,787,959,788,959,787,768,959,788,768,959,787,769,959,788,769,927,787,927,788,927,787,768,927,788,768,927,787,769,927,788,769,965,787,965,788,965,787,768,965,788,768,965,787,769,965,788,769,965,787,834,965,788,834,933,788,933,788,768,933,788,769,933,788,834,969,787,969,788,969,787,768,969,788,768,969,787,769,969,788,769,969,787,834,969,788,834,937,787,937,788,937,787,768,937,788,768,937,787,769,937,788,769,937,787,834,937,788,834,945,768,945,769,949,768,949,769,951,768,951,769,953,768,953,769,959,768,959,769,965,768,965,769,969,768,969,769,945,787,837,945,788,837,945,787,768,837,945,788,768,837,945,787,769,837,945,788,769,837,945,787,834,837,945,788,834,837,913,787,837,913,788,837,913,787,768,837,913,788,768,837,913,787,769,837,913,788,769,837,913,787,834,837,913,
 |
|
22923
|
|
|
|
|
|
|
788,834,837,951,787,837,951,788,837,951,787,768,837,951,788,768,837,951,787,769,837,951,788,769,837,951,787,834,837,951,788,834,837,919,787,837,919,788,837,919,787,768,837,919,788,768,837,919,787,769,837,919,788,769,837,919,787,834,837,919,788,834,837,969,787,837,969,788,837,969,787,768,837,969,788,768,837,969,787,769,837,969,788,769,837,969,787,834,837,969,788,834,837,937,787,837,937,788,837,937,787,768,837,937,788,768,837,937,787,769,837,937,788,769,837,937,787,834,837,937,788,834,837,945,774,945,772,945,768,837,945,837,945,769,837,945,834,945,834,837,913,774,913,772,913,768,913,769,913,837,32,787,953,32,787,32,834,168,834,951,768,837,951,837,951,769,837,951,834,951,834,837,917,768,917,769,919,768,919,769,919,837,8127,768,8127,769,8127,834,953,774,953,772,953,776,768,953,776,769,953,834,953,776,834,921,774,921,772,921,768,921,769,8190,768,8190,769,8190,834,965,774,965,772,965,776,768,965,776,769,961,787,961,788,965,834,965,776,834,933,774,933,772,933,768,933,769,929,788,168,768,168,769,96,969,768,837,969,837,969,769,837,969,834,969,834,837,927,768,927,769,937,768,937,769,937,837,180,32,788,8194,8195,32,32,32,32,32,32,32,32,32,8208,32,819,46,46,46,46,46,46,32,8242,8242,8242,8242,8242,8245,8245,8245,8245,8245,33,33,32,773,63,63,63,33,33,63,8242,8242,8242,8242,32,48,105,52,53,54,55,56,57,43,8722,61,40,41,110,48,49,50,51,52,53,54,55,56,57,43,8722,61,40,41,97,101,111,120,601,104,107,108,109,110,112,115,116,82,115,97,47,99,97,47,115,67,176,67,99,47,111,99,47,117,400,176,70,103,72,72,72,104,295,73,73,76,108,78,78,111,80,81,82,82,82,83,77,84,69,76,84,77,90,937,90,75,65,778,66,67,101,69,70,77,111,1488,1489,1490,1491,105,70,65,88,960,947,915,928,8721,68,100,101,105,106,49,8260,55,49,8260,57,49,8260,49,48,49,8260,51,50,8260,51,49,8260,53,50,8260,53,51,8260,53,52,8260,53,49,8260,54,53,8260,54,49,8260,56,51,8260,56,53,8260,56,55,8260,56,49,8260,73,73,73,73,73,73,73,86,86,86,73,86,73,73,86,73,73,73,73,88,88,88,73,88,73,73,76,67,68,77,105,105,105,105,105,105,105,118,118,118,105,
 |
|
22924
|
|
|
|
|
|
|
118,105,105,118,105,105,105,105,120,120,120,105,120,105,105,108,99,100,109,48,8260,51,8592,824,8594,824,8596,824,8656,824,8660,824,8658,824,8707,824,8712,824,8715,824,8739,824,8741,824,8747,8747,8747,8747,8747,8750,8750,8750,8750,8750,8764,824,8771,824,8773,824,8776,824,61,824,8801,824,8781,824,60,824,62,824,8804,824,8805,824,8818,824,8819,824,8822,824,8823,824,8826,824,8827,824,8834,824,8835,824,8838,824,8839,824,8866,824,8872,824,8873,824,8875,824,8828,824,8829,824,8849,824,8850,824,8882,824,8883,824,8884,824,8885,824,12296,12297,49,50,51,52,53,54,55,56,57,49,48,49,49,49,50,49,51,49,52,49,53,49,54,49,55,49,56,49,57,50,48,40,49,41,40,50,41,40,51,41,40,52,41,40,53,41,40,54,41,40,55,41,40,56,41,40,57,41,40,49,48,41,40,49,49,41,40,49,50,41,40,49,51,41,40,49,52,41,40,49,53,41,40,49,54,41,40,49,55,41,40,49,56,41,40,49,57,41,40,50,48,41,49,46,50,46,51,46,52,46,53,46,54,46,55,46,56,46,57,46,49,48,46,49,49,46,49,50,46,49,51,46,49,52,46,49,53,46,49,54,46,49,55,46,49,56,46,49,57,46,50,48,46,40,97,41,40,98,41,40,99,41,40,100,41,40,101,41,40,102,41,40,103,41,40,104,41,40,105,41,40,106,41,40,107,41,40,108,41,40,109,41,40,110,41,40,111,41,40,112,41,40,113,41,40,114,41,40,115,41,40,116,41,40,117,41,40,118,41,40,119,41,40,120,41,40,121,41,40,122,41,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,48,8747,8747,8747,8747,58,58,61,61,61,61,61,61,10973,824,106,86,11617,27597,40863,19968,20008,20022,20031,20057,20101,20108,20128,20154,20799,20837,20843,20866,20886,20907,20960,20981,20992,21147,21241,21269,21274,21304,21313,21340,21353,21378,21430,21448,21475,22231,22303,22763,22786,22794,22805,22823,22899,23376,23424,23544,23567,23586,23608,23662,23665,24027,24037,24049,24062,24178,24186,24191,24308,24318,24331,24339,24400,24417,24435,24515,25096,25142,25163,25903,25908,25991,26007,26020,26041,26080,26085,26352,26376,26408,27424,27490,27513,27571,27595, |
|
22925
|
|
|
|
|
|
|
27604,27611,27663,27668,27700,28779,29226,29238,29243,29247,29255,29273,29275,29356,29572,29577,29916,29926,29976,29983,29992,30000,30091,30098,30326,30333,30382,30399,30446,30683,30690,30707,31034,31160,31166,31348,31435,31481,31859,31992,32566,32593,32650,32701,32769,32780,32786,32819,32895,32905,33251,33258,33267,33276,33292,33307,33311,33390,33394,33400,34381,34411,34880,34892,34915,35198,35211,35282,35328,35895,35910,35925,35960,35997,36196,36208,36275,36523,36554,36763,36784,36789,37009,37193,37318,37324,37329,38263,38272,38428,38582,38585,38632,38737,38750,38754,38761,38859,38893,38899,38913,39080,39131,39135,39318,39321,39340,39592,39640,39647,39717,39727,39730,39740,39770,40165,40565,40575,40613,40635,40643,40653,40657,40697,40701,40718,40723,40736,40763,40778,40786,40845,40860,40864,32,12306,21313,21316,21317,12363,12441,12365,12441,12367,12441,12369,12441,12371,12441,12373,12441,12375,12441,12377,12441,12379,12441,12381,12441,12383,12441,12385,12441,12388,12441,12390,12441,12392,12441,12399,12441,12399,12442,12402,12441,12402,12442,12405,12441,12405,12442,12408,12441,12408,12442,12411,12441,12411,12442,12358,12441,32,12441,32,12442,12445,12441,12424,12426,12459,12441,12461,12441,12463,12441,12465,12441,12467,12441,12469,12441,12471,12441,12473,12441,12475,12441,12477,12441,12479,12441,12481,12441,12484,12441,12486,12441,12488,12441,12495,12441,12495,12442,12498,12441,12498,12442,12501,12441,12501,12442,12504,12441,12504,12442,12507,12441,12507,12442,12454,12441,12527,12441,12528,12441,12529,12441,12530,12441,12541,12441,12467,12488,4352,4353,4522,4354,4524,4525,4355,4356,4357,4528,4529,4530,4531,4532,4533,4378,4358,4359,4360,4385,4361,4362,4363,4364,4365,4366,4367,4368,4369,4370,4449,4450,4451,4452,4453,4454,4455,4456,4457,4458,4459,4460,4461,4462,4463,4464,4465,4466,4467,4468,4469,4448,4372,4373,4551,4552,4556,4558,4563,4567,4569,4380,4573,4575,4381,4382,4384,4386,4387,4391,4393,4395,4396,4397,4398,4399,4402,4406,4416,4423,4428,4593,4594,4439,4440,4441, 
|
|
22926
|
|
|
|
|
|
|
4484,4485,4488,4497,4498,4500,4510,4513,19968,20108,19977,22235,19978,20013,19979,30002,20057,19993,19969,22825,22320,20154,40,4352,41,40,4354,41,40,4355,41,40,4357,41,40,4358,41,40,4359,41,40,4361,41,40,4363,41,40,4364,41,40,4366,41,40,4367,41,40,4368,41,40,4369,41,40,4370,41,40,4352,4449,41,40,4354,4449,41,40,4355,4449,41,40,4357,4449,41,40,4358,4449,41,40,4359,4449,41,40,4361,4449,41,40,4363,4449,41,40,4364,4449,41,40,4366,4449,41,40,4367,4449,41,40,4368,4449,41,40,4369,4449,41,40,4370,4449,41,40,4364,4462,41,40,4363,4457,4364,4453,4523,41,40,4363,4457,4370,4462,41,40,19968,41,40,20108,41,40,19977,41,40,22235,41,40,20116,41,40,20845,41,40,19971,41,40,20843,41,40,20061,41,40,21313,41,40,26376,41,40,28779,41,40,27700,41,40,26408,41,40,37329,41,40,22303,41,40,26085,41,40,26666,41,40,26377,41,40,31038,41,40,21517,41,40,29305,41,40,36001,41,40,31069,41,40,21172,41,40,20195,41,40,21628,41,40,23398,41,40,30435,41,40,20225,41,40,36039,41,40,21332,41,40,31085,41,40,20241,41,40,33258,41,40,33267,41,21839,24188,25991,31631,80,84,69,50,49,50,50,50,51,50,52,50,53,50,54,50,55,50,56,50,57,51,48,51,49,51,50,51,51,51,52,51,53,4352,4354,4355,4357,4358,4359,4361,4363,4364,4366,4367,4368,4369,4370,4352,4449,4354,4449,4355,4449,4357,4449,4358,4449,4359,4449,4361,4449,4363,4449,4364,4449,4366,4449,4367,4449,4368,4449,4369,4449,4370,4449,4366,4449,4535,4352,4457,4364,4462,4363,4468,4363,4462,19968,20108,19977,22235,20116,20845,19971,20843,20061,21313,26376,28779,27700,26408,37329,22303,26085,26666,26377,31038,21517,29305,36001,31069,21172,31192,30007,22899,36969,20778,21360,27880,38917,20241,20889,27491,19978,20013,19979,24038,21491,21307,23447,23398,30435,20225,36039,21332,22812,51,54,51,55,51,56,51,57,52,48,52,49,52,50,52,51,52,52,52,53,52,54,52,55,52,56,52,57,53,48,49,26376,50,26376,51,26376,52,26376,53,26376,54,26376,55,26376,56,26376,57,26376,49,48,26376,49,49,26376,49,50,26376,72,103,101,114,103,101,86,76,84,68,12450,12452,12454,12456,12458,12459,12461,12463,12465,12467,12469, |
|
22927
|
|
|
|
|
|
|
12471,12473,12475,12477,12479,12481,12484,12486,12488,12490,12491,12492,12493,12494,12495,12498,12501,12504,12507,12510,12511,12512,12513,12514,12516,12518,12520,12521,12522,12523,12524,12525,12527,12528,12529,12530,20196,21644,12450,12495,12442,12540,12488,12450,12523,12501,12449,12450,12531,12504,12442,12450,12450,12540,12523,12452,12491,12531,12463,12441,12452,12531,12481,12454,12457,12531,12456,12473,12463,12540,12488,12441,12456,12540,12459,12540,12458,12531,12473,12458,12540,12512,12459,12452,12522,12459,12521,12483,12488,12459,12525,12522,12540,12459,12441,12525,12531,12459,12441,12531,12510,12461,12441,12459,12441,12461,12441,12491,12540,12461,12517,12522,12540,12461,12441,12523,12479,12441,12540,12461,12525,12461,12525,12463,12441,12521,12512,12461,12525,12513,12540,12488,12523,12461,12525,12527,12483,12488,12463,12441,12521,12512,12463,12441,12521,12512,12488,12531,12463,12523,12475,12441,12452,12525,12463,12525,12540,12493,12465,12540,12473,12467,12523,12490,12467,12540,12507,12442,12469,12452,12463,12523,12469,12531,12481,12540,12512,12471,12522,12531,12463,12441,12475,12531,12481,12475,12531,12488,12479,12441,12540,12473,12486,12441,12471,12488,12441,12523,12488,12531,12490,12494,12494,12483,12488,12495,12452,12484,12495,12442,12540,12475,12531,12488,12495,12442,12540,12484,12495,12441,12540,12524,12523,12498,12442,12450,12473,12488,12523,12498,12442,12463,12523,12498,12442,12467,12498,12441,12523,12501,12449,12521,12483,12488,12441,12501,12451,12540,12488,12501,12441,12483,12471,12455,12523,12501,12521,12531,12504,12463,12479,12540,12523,12504,12442,12477,12504,12442,12491,12498,12504,12523,12484,12504,12442,12531,12473,12504,12442,12540,12471,12441,12504,12441,12540,12479,12507,12442,12452,12531,12488,12507,12441,12523,12488,12507,12531,12507,12442,12531,12488,12441,12507,12540,12523,12507,12540,12531,12510,12452,12463,12525,12510,12452,12523,12510,12483,12495,12510,12523,12463,12510,12531,12471,12519,12531,12511,12463,12525,12531,12511,12522,12511, |
|
22928
|
|
|
|
|
|
|
12522,12495,12441,12540,12523,12513,12459,12441,12513,12459,12441,12488,12531,12513,12540,12488,12523,12516,12540,12488,12441,12516,12540,12523,12518,12450,12531,12522,12483,12488,12523,12522,12521,12523,12498,12442,12540,12523,12540,12501,12441,12523,12524,12512,12524,12531,12488,12465,12441,12531,12527,12483,12488,48,28857,49,28857,50,28857,51,28857,52,28857,53,28857,54,28857,55,28857,56,28857,57,28857,49,48,28857,49,49,28857,49,50,28857,49,51,28857,49,52,28857,49,53,28857,49,54,28857,49,55,28857,49,56,28857,49,57,28857,50,48,28857,50,49,28857,50,50,28857,50,51,28857,50,52,28857,104,80,97,100,97,65,85,98,97,114,111,86,112,99,100,109,100,109,50,100,109,51,73,85,24179,25104,26157,21644,22823,27491,26126,27835,26666,24335,20250,31038,112,65,110,65,956,65,109,65,107,65,75,66,77,66,71,66,99,97,108,107,99,97,108,112,70,110,70,956,70,956,103,109,103,107,103,72,122,107,72,122,77,72,122,71,72,122,84,72,122,956,108,109,108,100,108,107,108,102,109,110,109,956,109,109,109,99,109,107,109,109,109,50,99,109,50,109,50,107,109,50,109,109,51,99,109,51,109,51,107,109,51,109,8725,115,109,8725,115,50,80,97,107,80,97,77,80,97,71,80,97,114,97,100,114,97,100,8725,115,114,97,100,8725,115,50,112,115,110,115,956,115,109,115,112,86,110,86,956,86,109,86,107,86,77,86,112,87,110,87,956,87,109,87,107,87,77,87,107,937,77,937,97,46,109,46,66,113,99,99,99,100,67,8725,107,103,67,111,46,100,66,71,121,104,97,72,80,105,110,75,75,75,77,107,116,108,109,108,110,108,111,103,108,120,109,98,109,105,108,109,111,108,80,72,112,46,109,46,80,80,77,80,82,115,114,83,118,87,98,86,8725,109,65,8725,109,49,26085,50,26085,51,26085,52,26085,53,26085,54,26085,55,26085,56,26085,57,26085,49,48,26085,49,49,26085,49,50,26085,49,51,26085,49,52,26085,49,53,26085,49,54,26085,49,55,26085,49,56,26085,49,57,26085,50,48,26085,50,49,26085,50,50,26085,50,51,26085,50,52,26085,50,53,26085,50,54,26085,50,55,26085,50,56,26085,50,57,26085,51,48,26085,51,49,26085,103,97,108,1098,1100,42863,67,70,81,294,339,42791,43831,619,43858,653,35912, |
|
22929
|
|
|
|
|
|
|
26356,36554,36040,28369,20018,21477,40860,40860,22865,37329,21895,22856,25078,30313,32645,34367,34746,35064,37007,27138,27931,28889,29662,33853,37226,39409,20098,21365,27396,29211,34349,40478,23888,28651,34253,35172,25289,33240,34847,24266,26391,28010,29436,37070,20358,20919,21214,25796,27347,29200,30439,32769,34310,34396,36335,38706,39791,40442,30860,31103,32160,33737,37636,40575,35542,22751,24324,31840,32894,29282,30922,36034,38647,22744,23650,27155,28122,28431,32047,32311,38475,21202,32907,20956,20940,31260,32190,33777,38517,35712,25295,27138,35582,20025,23527,24594,29575,30064,21271,30971,20415,24489,19981,27852,25976,32034,21443,22622,30465,33865,35498,27578,36784,27784,25342,33509,25504,30053,20142,20841,20937,26753,31975,33391,35538,37327,21237,21570,22899,24300,26053,28670,31018,38317,39530,40599,40654,21147,26310,27511,36706,24180,24976,25088,25754,28451,29001,29833,31178,32244,32879,36646,34030,36899,37706,21015,21155,21693,28872,35010,35498,24265,24565,25467,27566,31806,29557,20196,22265,23527,23994,24604,29618,29801,32666,32838,37428,38646,38728,38936,20363,31150,37300,38584,24801,20102,20698,23534,23615,26009,27138,29134,30274,34044,36988,40845,26248,38446,21129,26491,26611,27969,28316,29705,30041,30827,32016,39006,20845,25134,38520,20523,23833,28138,36650,24459,24900,26647,29575,38534,21033,21519,23653,26131,26446,26792,27877,29702,30178,32633,35023,35041,37324,38626,21311,28346,21533,29136,29848,34298,38563,40023,40607,26519,28107,33256,31435,31520,31890,29376,28825,35672,20160,33590,21050,20999,24230,25299,31958,23429,27934,26292,36667,34892,38477,35211,24275,20800,21952,22618,26228,20958,29482,30410,31036,31070,31077,31119,38742,31934,32701,34322,35576,36920,37117,39151,39164,39208,40372,37086,38583,20398,20711,20813,21193,21220,21329,21917,22022,22120,22592,22696,23652,23662,24724,24936,24974,25074,25935,26082,26257,26757,28023,28186,28450,29038,29227,29730,30865,31038,31049,31048,31056,31062,31069,31117,31118,31296,31361,31680,32244,32265,32321, |
|
22930
|
|
|
|
|
|
|
32626,32773,33261,33401,33401,33879,35088,35222,35585,35641,36051,36104,36790,36920,38627,38911,38971,24693,148206,33304,20006,20917,20840,20352,20805,20864,21191,21242,21917,21845,21913,21986,22618,22707,22852,22868,23138,23336,24274,24281,24425,24493,24792,24910,24840,24974,24928,25074,25140,25540,25628,25682,25942,26228,26391,26395,26454,27513,27578,27969,28379,28363,28450,28702,29038,30631,29237,29359,29482,29809,29958,30011,30237,30239,30410,30427,30452,30538,30528,30924,31409,31680,31867,32091,32244,32574,32773,33618,33775,34681,35137,35206,35222,35519,35576,35531,35585,35582,35565,35641,35722,36104,36664,36978,37273,37494,38524,38627,38742,38875,38911,38923,38971,39698,40860,141386,141380,144341,15261,16408,16441,152137,154832,163539,40771,40846,102,102,102,105,102,108,102,102,105,102,102,108,115,116,115,116,1396,1398,1396,1381,1396,1387,1406,1398,1396,1389,1497,1460,1522,1463,1506,1488,1491,1492,1499,1500,1501,1512,1514,43,1513,1473,1513,1474,1513,1468,1473,1513,1468,1474,1488,1463,1488,1464,1488,1468,1489,1468,1490,1468,1491,1468,1492,1468,1493,1468,1494,1468,1496,1468,1497,1468,1498,1468,1499,1468,1500,1468,1502,1468,1504,1468,1505,1468,1507,1468,1508,1468,1510,1468,1511,1468,1512,1468,1513,1468,1514,1468,1493,1465,1489,1471,1499,1471,1508,1471,1488,1500,1649,1649,1659,1659,1659,1659,1662,1662,1662,1662,1664,1664,1664,1664,1658,1658,1658,1658,1663,1663,1663,1663,1657,1657,1657,1657,1700,1700,1700,1700,1702,1702,1702,1702,1668,1668,1668,1668,1667,1667,1667,1667,1670,1670,1670,1670,1671,1671,1671,1671,1677,1677,1676,1676,1678,1678,1672,1672,1688,1688,1681,1681,1705,1705,1705,1705,1711,1711,1711,1711,1715,1715,1715,1715,1713,1713,1713,1713,1722,1722,1723,1723,1723,1723,1749,1620,1749,1620,1729,1729,1729,1729,1726,1726,1726,1726,1746,1746,1746,1620,1746,1620,1709,1709,1709,1709,1735,1735,1734,1734,1736,1736,1735,1652,1739,1739,1733,1733,1737,1737,1744,1744,1744,1744,1609,1609,1610,1620,1575,1610,1620,1575,1610,1620,1749,1610,1620,1749,1610,1620,1608,1610,1620,
 |
|
22931
|
|
|
|
|
|
|
1608,1610,1620,1735,1610,1620,1735,1610,1620,1734,1610,1620,1734,1610,1620,1736,1610,1620,1736,1610,1620,1744,1610,1620,1744,1610,1620,1744,1610,1620,1609,1610,1620,1609,1610,1620,1609,1740,1740,1740,1740,1610,1620,1580,1610,1620,1581,1610,1620,1605,1610,1620,1609,1610,1620,1610,1576,1580,1576,1581,1576,1582,1576,1605,1576,1609,1576,1610,1578,1580,1578,1581,1578,1582,1578,1605,1578,1609,1578,1610,1579,1580,1579,1605,1579,1609,1579,1610,1580,1581,1580,1605,1581,1580,1581,1605,1582,1580,1582,1581,1582,1605,1587,1580,1587,1581,1587,1582,1587,1605,1589,1581,1589,1605,1590,1580,1590,1581,1590,1582,1590,1605,1591,1581,1591,1605,1592,1605,1593,1580,1593,1605,1594,1580,1594,1605,1601,1580,1601,1581,1601,1582,1601,1605,1601,1609,1601,1610,1602,1581,1602,1605,1602,1609,1602,1610,1603,1575,1603,1580,1603,1581,1603,1582,1603,1604,1603,1605,1603,1609,1603,1610,1604,1580,1604,1581,1604,1582,1604,1605,1604,1609,1604,1610,1605,1580,1605,1581,1605,1582,1605,1605,1605,1609,1605,1610,1606,1580,1606,1581,1606,1582,1606,1605,1606,1609,1606,1610,1607,1580,1607,1605,1607,1609,1607,1610,1610,1580,1610,1581,1610,1582,1610,1605,1610,1609,1610,1610,1584,1648,1585,1648,1609,1648,32,1612,1617,32,1613,1617,32,1614,1617,32,1615,1617,32,1616,1617,32,1617,1648,1610,1620,1585,1610,1620,1586,1610,1620,1605,1610,1620,1606,1610,1620,1609,1610,1620,1610,1576,1585,1576,1586,1576,1605,1576,1606,1576,1609,1576,1610,1578,1585,1578,1586,1578,1605,1578,1606,1578,1609,1578,1610,1579,1585,1579,1586,1579,1605,1579,1606,1579,1609,1579,1610,1601,1609,1601,1610,1602,1609,1602,1610,1603,1575,1603,1604,1603,1605,1603,1609,1603,1610,1604,1605,1604,1609,1604,1610,1605,1575,1605,1605,1606,1585,1606,1586,1606,1605,1606,1606,1606,1609,1606,1610,1609,1648,1610,1585,1610,1586,1610,1605,1610,1606,1610,1609,1610,1610,1610,1620,1580,1610,1620,1581,1610,1620,1582,1610,1620,1605,1610,1620,1607,1576,1580,1576,1581,1576,1582,1576,1605,1576,1607,1578,1580,1578,1581,1578,1582,1578,1605,1578,1607,1579,1605,1580,1581,1580,1605,1581, |
|
22932
|
|
|
|
|
|
|
1580,1581,1605,1582,1580,1582,1605,1587,1580,1587,1581,1587,1582,1587,1605,1589,1581,1589,1582,1589,1605,1590,1580,1590,1581,1590,1582,1590,1605,1591,1581,1592,1605,1593,1580,1593,1605,1594,1580,1594,1605,1601,1580,1601,1581,1601,1582,1601,1605,1602,1581,1602,1605,1603,1580,1603,1581,1603,1582,1603,1604,1603,1605,1604,1580,1604,1581,1604,1582,1604,1605,1604,1607,1605,1580,1605,1581,1605,1582,1605,1605,1606,1580,1606,1581,1606,1582,1606,1605,1606,1607,1607,1580,1607,1605,1607,1648,1610,1580,1610,1581,1610,1582,1610,1605,1610,1607,1610,1620,1605,1610,1620,1607,1576,1605,1576,1607,1578,1605,1578,1607,1579,1605,1579,1607,1587,1605,1587,1607,1588,1605,1588,1607,1603,1604,1603,1605,1604,1605,1606,1605,1606,1607,1610,1605,1610,1607,1600,1614,1617,1600,1615,1617,1600,1616,1617,1591,1609,1591,1610,1593,1609,1593,1610,1594,1609,1594,1610,1587,1609,1587,1610,1588,1609,1588,1610,1581,1609,1581,1610,1580,1609,1580,1610,1582,1609,1582,1610,1589,1609,1589,1610,1590,1609,1590,1610,1588,1580,1588,1581,1588,1582,1588,1605,1588,1585,1587,1585,1589,1585,1590,1585,1591,1609,1591,1610,1593,1609,1593,1610,1594,1609,1594,1610,1587,1609,1587,1610,1588,1609,1588,1610,1581,1609,1581,1610,1580,1609,1580,1610,1582,1609,1582,1610,1589,1609,1589,1610,1590,1609,1590,1610,1588,1580,1588,1581,1588,1582,1588,1605,1588,1585,1587,1585,1589,1585,1590,1585,1588,1580,1588,1581,1588,1582,1588,1605,1587,1607,1588,1607,1591,1605,1587,1580,1587,1581,1587,1582,1588,1580,1588,1581,1588,1582,1591,1605,1592,1605,1575,1611,1575,1611,1578,1580,1605,1578,1581,1580,1578,1581,1580,1578,1581,1605,1578,1582,1605,1578,1605,1580,1578,1605,1581,1578,1605,1582,1580,1605,1581,1580,1605,1581,1581,1605,1610,1581,1605,1609,1587,1581,1580,1587,1580,1581,1587,1580,1609,1587,1605,1581,1587,1605,1581,1587,1605,1580,1587,1605,1605,1587,1605,1605,1589,1581,1581,1589,1581,1581,1589,1605,1605,1588,1581,1605,1588,1581,1605,1588,1580,1610,1588,1605,1582,1588,1605,1582,1588,1605,1605,1588,1605,1605,1590,1581,1609,1590,1582,1605,1590,1582,
 |
|
22933
|
|
|
|
|
|
|
1605,1591,1605,1581,1591,1605,1581,1591,1605,1605,1591,1605,1610,1593,1580,1605,1593,1605,1605,1593,1605,1605,1593,1605,1609,1594,1605,1605,1594,1605,1610,1594,1605,1609,1601,1582,1605,1601,1582,1605,1602,1605,1581,1602,1605,1605,1604,1581,1605,1604,1581,1610,1604,1581,1609,1604,1580,1580,1604,1580,1580,1604,1582,1605,1604,1582,1605,1604,1605,1581,1604,1605,1581,1605,1581,1580,1605,1581,1605,1605,1581,1610,1605,1580,1581,1605,1580,1605,1605,1582,1580,1605,1582,1605,1605,1580,1582,1607,1605,1580,1607,1605,1605,1606,1581,1605,1606,1581,1609,1606,1580,1605,1606,1580,1605,1606,1580,1609,1606,1605,1610,1606,1605,1609,1610,1605,1605,1610,1605,1605,1576,1582,1610,1578,1580,1610,1578,1580,1609,1578,1582,1610,1578,1582,1609,1578,1605,1610,1578,1605,1609,1580,1605,1610,1580,1581,1609,1580,1605,1609,1587,1582,1609,1589,1581,1610,1588,1581,1610,1590,1581,1610,1604,1580,1610,1604,1605,1610,1610,1581,1610,1610,1580,1610,1610,1605,1610,1605,1605,1610,1602,1605,1610,1606,1581,1610,1602,1605,1581,1604,1581,1605,1593,1605,1610,1603,1605,1610,1606,1580,1581,1605,1582,1610,1604,1580,1605,1603,1605,1605,1604,1580,1605,1606,1580,1581,1580,1581,1610,1581,1580,1610,1605,1580,1610,1601,1605,1610,1576,1581,1610,1603,1605,1605,1593,1580,1605,1589,1605,1605,1587,1582,1610,1606,1580,1610,1589,1604,1746,1602,1604,1746,1575,1604,1604,1607,1575,1603,1576,1585,1605,1581,1605,1583,1589,1604,1593,1605,1585,1587,1608,1604,1593,1604,1610,1607,1608,1587,1604,1605,1589,1604,1609,1589,1604,1609,32,1575,1604,1604,1607,32,1593,1604,1610,1607,32,1608,1587,1604,1605,1580,1604,32,1580,1604,1575,1604,1607,1585,1740,1575,1604,44,12289,12290,58,59,33,63,12310,12311,46,46,46,46,46,8212,8211,95,95,40,41,123,125,12308,12309,12304,12305,12298,12299,12296,12297,12300,12301,12302,12303,91,93,32,773,32,773,32,773,32,773,95,95,95,44,12289,46,59,58,63,33,8212,40,41,123,125,12308,12309,35,38,42,43,45,60,62,61,92,36,37,64,32,1611,1600,1611,32,1612,32,1613,32,1614,1600,1614,32,1615,1600,1615,32,1616,1600,1616,32,1617,1600, |
|
22934
|
|
|
|
|
|
|
1617,32,1618,1600,1618,1569,1575,1619,1575,1619,1575,1620,1575,1620,1608,1620,1608,1620,1575,1621,1575,1621,1610,1620,1610,1620,1610,1620,1610,1620,1575,1575,1576,1576,1576,1576,1577,1577,1578,1578,1578,1578,1579,1579,1579,1579,1580,1580,1580,1580,1581,1581,1581,1581,1582,1582,1582,1582,1583,1583,1584,1584,1585,1585,1586,1586,1587,1587,1587,1587,1588,1588,1588,1588,1589,1589,1589,1589,1590,1590,1590,1590,1591,1591,1591,1591,1592,1592,1592,1592,1593,1593,1593,1593,1594,1594,1594,1594,1601,1601,1601,1601,1602,1602,1602,1602,1603,1603,1603,1603,1604,1604,1604,1604,1605,1605,1605,1605,1606,1606,1606,1606,1607,1607,1607,1607,1608,1608,1609,1609,1610,1610,1610,1610,1604,1575,1619,1604,1575,1619,1604,1575,1620,1604,1575,1620,1604,1575,1621,1604,1575,1621,1604,1575,1604,1575,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,123,124,125,126,10629,10630,12290,12300,12301,12289,12539,12530,12449,12451,12453,12455,12457,12515,12517,12519,12483,12540,12450,12452,12454,12456,12458,12459,12461,12463,12465,12467,12469,12471,12473,12475,12477,12479,12481,12484,12486,12488,12490,12491,12492,12493,12494,12495,12498,12501,12504,12507,12510,12511,12512,12513,12514,12516,12518,12520,12521,12522,12523,12524,12525,12527,12531,12441,12442,4448,4352,4353,4522,4354,4524,4525,4355,4356,4357,4528,4529,4530,4531,4532,4533,4378,4358,4359,4360,4385,4361,4362,4363,4364,4365,4366,4367,4368,4369,4370,4449,4450,4451,4452,4453,4454,4455,4456,4457,4458,4459,4460,4461,4462,4463,4464,4465,4466,4467,4468,4469,162,163,172,32,772,166,165,8361,9474,8592,8593,8594,8595,9632,9675,720,721,230,665,595,675,43878,677,676,598,599,7569,600,606,681,612,610,608,667,295,668,615,644,682,683,620,122628,42894,622,122629,654,122630,248,630,631,113,634,122632,637,638,640,680,678,43879, 
|
|
22935
|
|
|
|
|
|
|
679,648,11377,655,673,674,664,448,449,450,122634,122654,69785,69818,69787,69818,69797,69818,69937,69927,69938,69927,70471,70462,70471,70487,70841,70842,70841,70832,70841,70845,71096,71087,71097,71087,71989,71984,119127,119141,119128,119141,119128,119141,119150,119128,119141,119151,119128,119141,119152,119128,119141,119153,119128,119141,119154,119225,119141,119226,119141,119225,119141,119150,119226,119141,119150,119225,119141,119151,119226,119141,119151,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,67,68,71,74,75,78,79,80,81,83,84,85,86,87,88,89,90,97,98,99,100,102,104,105,106,107,108,109,110,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,68,69,70,71,74,75,76,77,78,79,80,81,83,84,85,86,87,88,89,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,68,69,70,71,73,74,75,76,77,79,83,84,85,86,87,88,89,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66, |
|
22936
|
|
|
|
|
|
|
67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,97,98,99,100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115,116,117,118,119,120,121,122,305,567,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,920,931,932,933,934,935,936,937,8711,945,946,947,948,949,950,951,952,953,954,955,956,957,958,959,960,961,962,963,964,965,966,967,968,969,8706,949,952,954,966,961,960,988,989,48,49,50,51,52,53,54,55,56,57,48,49,50,51,52,53,54,55,56,57,48,49,50,51,52,53,54,55,56,57,48,49,50,51, |
|
22937
|
|
|
|
|
|
|
52,53,54,55,56,57,48,49,50,51,52,53,54,55,56,57,1072,1073,1074,1075,1076,1077,1078,1079,1080,1082,1083,1084,1086,1087,1088,1089,1090,1091,1092,1093,1094,1095,1096,1099,1101,1102,42633,1241,1110,1112,1257,1199,1231,1072,1073,1074,1075,1076,1077,1078,1079,1080,1082,1083,1086,1087,1089,1091,1092,1093,1094,1095,1096,1098,1099,1169,1110,1109,1119,1195,42577,1201,1575,1576,1580,1583,1608,1586,1581,1591,1610,1603,1604,1605,1606,1587,1593,1601,1589,1602,1585,1588,1578,1579,1582,1584,1590,1592,1594,1646,1722,1697,1647,1576,1580,1607,1581,1610,1603,1604,1605,1606,1587,1593,1601,1589,1602,1588,1578,1579,1582,1590,1594,1580,1581,1610,1604,1606,1587,1593,1589,1602,1588,1582,1590,1594,1722,1647,1576,1580,1607,1581,1591,1610,1603,1605,1606,1587,1593,1601,1589,1602,1588,1578,1579,1582,1590,1592,1594,1646,1697,1575,1576,1580,1583,1607,1608,1586,1581,1591,1610,1604,1605,1606,1587,1593,1601,1589,1602,1585,1588,1578,1579,1582,1584,1590,1592,1594,1576,1580,1583,1608,1586,1581,1591,1610,1604,1605,1606,1587,1593,1601,1589,1602,1585,1588,1578,1579,1582,1584,1590,1592,1594,48,46,48,44,49,44,50,44,51,44,52,44,53,44,54,44,55,44,56,44,57,44,40,65,41,40,66,41,40,67,41,40,68,41,40,69,41,40,70,41,40,71,41,40,72,41,40,73,41,40,74,41,40,75,41,40,76,41,40,77,41,40,78,41,40,79,41,40,80,41,40,81,41,40,82,41,40,83,41,40,84,41,40,85,41,40,86,41,40,87,41,40,88,41,40,89,41,40,90,41,12308,83,12309,67,82,67,68,87,90,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,72,86,77,86,83,68,83,83,80,80,86,87,67,77,67,77,68,77,82,68,74,12411,12363,12467,12467,12469,25163,23383,21452,12486,12441,20108,22810,35299,22825,20132,26144,28961,26009,21069,24460,20877,26032,21021,32066,29983,36009,22768,21561,28436,25237,25429,19968,19977,36938,24038,20013,21491,25351,36208,25171,31105,31354,21512,28288,26377,26376,30003,21106,21942,37197,12308,26412,12309,12308,19977,12309,12308,20108,12309,12308,23433,12309,12308,28857,12309,12308,25171,12309,12308,30423,12309,12308,21213,12309,12308,25943, |
|
22938
|
|
|
|
|
|
|
12309,24471,21487,48,49,50,51,52,53,54,55,56,57,20029,20024,20033,131362,20320,20398,20411,20482,20602,20633,20711,20687,13470,132666,20813,20820,20836,20855,132380,13497,20839,20877,132427,20887,20900,20172,20908,20917,168415,20981,20995,13535,21051,21062,21106,21111,13589,21191,21193,21220,21242,21253,21254,21271,21321,21329,21338,21363,21373,21375,21375,21375,133676,28784,21450,21471,133987,21483,21489,21510,21662,21560,21576,21608,21666,21750,21776,21843,21859,21892,21892,21913,21931,21939,21954,22294,22022,22295,22097,22132,20999,22766,22478,22516,22541,22411,22578,22577,22700,136420,22770,22775,22790,22810,22818,22882,136872,136938,23020,23067,23079,23000,23142,14062,14076,23304,23358,23358,137672,23491,23512,23527,23539,138008,23551,23558,24403,23586,14209,23648,23662,23744,23693,138724,23875,138726,23918,23915,23932,24033,24034,14383,24061,24104,24125,24169,14434,139651,14460,24240,24243,24246,24266,172946,24318,140081,140081,33281,24354,24354,14535,144056,156122,24418,24427,14563,24474,24525,24535,24569,24705,14650,14620,24724,141012,24775,24904,24908,24910,24908,24954,24974,25010,24996,25007,25054,25074,25078,25104,25115,25181,25265,25300,25424,142092,25405,25340,25448,25475,25572,142321,25634,25541,25513,14894,25705,25726,25757,25719,14956,25935,25964,143370,26083,26360,26185,15129,26257,15112,15076,20882,20885,26368,26268,32941,17369,26391,26395,26401,26462,26451,144323,15177,26618,26501,26706,26757,144493,26766,26655,26900,15261,26946,27043,27114,27304,145059,27355,15384,27425,145575,27476,15438,27506,27551,27578,27579,146061,138507,146170,27726,146620,27839,27853,27751,27926,27966,28023,27969,28009,28024,28037,146718,27956,28207,28270,15667,28363,28359,147153,28153,28526,147294,147342,28614,28729,28702,28699,15766,28746,28797,28791,28845,132389,28997,148067,29084,148395,29224,29237,29264,149000,29312,29333,149301,149524,29562,29579,16044,29605,16056,16056,29767,29788,29809,29829,29898,16155,29988,150582,30014,150674,30064,139679,30224,151457,151480, |
|
22939
|
|
|
|
|
|
|
151620,16380,16392,30452,151795,151794,151833,151859,30494,30495,30495,30538,16441,30603,16454,16534,152605,30798,30860,30924,16611,153126,31062,153242,153285,31119,31211,16687,31296,31306,31311,153980,154279,154279,31470,16898,154539,31686,31689,16935,154752,31954,17056,31976,31971,32000,155526,32099,17153,32199,32258,32325,17204,156200,156231,17241,156377,32634,156478,32661,32762,32773,156890,156963,32864,157096,32880,144223,17365,32946,33027,17419,33086,23221,157607,157621,144275,144284,33281,33284,36766,17515,33425,33419,33437,21171,33457,33459,33469,33510,158524,33509,33565,33635,33709,33571,33725,33767,33879,33619,33738,33740,33756,158774,159083,158933,17707,34033,34035,34070,160714,34148,159532,17757,17761,159665,159954,17771,34384,34396,34407,34409,34473,34440,34574,34530,34681,34600,34667,34694,17879,34785,34817,17913,34912,34915,161383,35031,35038,17973,35066,13499,161966,162150,18110,18119,35488,35565,35722,35925,162984,36011,36033,36123,36215,163631,133124,36299,36284,36336,133342,36564,36664,165330,165357,37012,37105,37137,165678,37147,37432,37591,37592,37500,37881,37909,166906,38283,18837,38327,167287,18918,38595,23986,38691,168261,168474,19054,19062,38880,168970,19122,169110,38923,38923,38953,169398,39138,19251,39209,39335,39362,39422,19406,170800,39698,40000,40189,19662,19693,40295,172238,19704,172293,172558,172689,40635,19798,40697,40702,40709,40719,40726,40763,173568 |
|
22940
|
|
|
|
|
|
|
}; |
|
22941
|
|
|
|
|
|
|
|
|
22942
|
|
|
|
|
|
|
} // namespace unilib |
|
22943
|
|
|
|
|
|
|
|
|
22944
|
|
|
|
|
|
|
///////// |
|
22945
|
|
|
|
|
|
|
// File: unilib/utf8.cpp |
|
22946
|
|
|
|
|
|
|
///////// |
|
22947
|
|
|
|
|
|
|
|
|
22948
|
|
|
|
|
|
|
// This file is part of UniLib <http://github.com/ufal/unilib/>. |
|
22949
|
|
|
|
|
|
|
// |
|
22950
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
22951
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
22952
|
|
|
|
|
|
|
// |
|
22953
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
22954
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
22955
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
22956
|
|
|
|
|
|
|
// |
|
22957
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
22958
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
22959
|
|
|
|
|
|
|
|
|
22960
|
|
|
|
|
|
|
namespace unilib { |
|
22961
|
|
|
|
|
|
|
|
|
22962
|
0
|
|
|
|
|
|
// Returns true when the NUL-terminated string `str` is structurally valid
// UTF-8: every byte >= 0x80 must be a lead byte in the range 0xC0-0xF7 that is
// followed by one, two or three continuation bytes (0x80-0xBF) respectively.
// Only the byte structure is checked; the decoded code point values are not
// inspected.
bool utf8::valid(const char* str) {
  for (; *str; str++) {
    const unsigned char lead = static_cast<unsigned char>(*str);
    if (lead < 0x80) continue;      // single-byte (ASCII) character
    if (lead < 0xC0) return false;  // continuation byte without a lead byte

    // Number of continuation bytes announced by the lead byte.
    int continuations;
    if (lead < 0xE0) continuations = 1;
    else if (lead < 0xF0) continuations = 2;
    else if (lead < 0xF8) continuations = 3;
    else return false;              // 0xF8-0xFF never start a sequence

    // The terminating NUL fails the range test below, so a truncated sequence
    // is rejected before anything past the terminator is read.
    for (; continuations > 0; continuations--) {
      const unsigned char next = static_cast<unsigned char>(*++str);
      if (next < 0x80 || next >= 0xC0) return false;
    }
  }
  return true;
}
|
22979
|
|
|
|
|
|
|
|
|
22980
|
0
|
|
|
|
|
|
// Returns true when the `len` bytes starting at `str` are structurally valid
// UTF-8: every byte >= 0x80 must be a lead byte in the range 0xC0-0xF7 that is
// followed, inside the buffer, by one, two or three continuation bytes
// (0x80-0xBF) respectively. Bytes below 0x80, including NUL, are accepted as
// single-byte characters. Decoded code point values are not inspected.
bool utf8::valid(const char* str, size_t len) {
  for (; len > 0; str++, len--) {
    const unsigned char lead = static_cast<unsigned char>(*str);
    if (lead < 0x80) continue;      // single-byte character
    if (lead < 0xC0) return false;  // continuation byte without a lead byte

    // Number of continuation bytes announced by the lead byte.
    int continuations;
    if (lead < 0xE0) continuations = 1;
    else if (lead < 0xF0) continuations = 2;
    else if (lead < 0xF8) continuations = 3;
    else return false;              // 0xF8-0xFF never start a sequence

    for (; continuations > 0; continuations--) {
      str++;
      // The length is checked before the byte is read, so a sequence cut off
      // by the end of the buffer never causes an out-of-bounds access.
      if (!--len) return false;
      const unsigned char next = static_cast<unsigned char>(*str);
      if (next < 0x80 || next >= 0xC0) return false;
    }
  }
  return true;
}
|
22997
|
|
|
|
|
|
|
|
|
22998
|
0
|
|
|
|
|
|
// Decodes the NUL-terminated UTF-8 string `str` into `decoded`. Any previous
// content of `decoded` is discarded. Characters are appended one by one as
// returned by the single-character decode(str) overload; decoding stops at the
// first 0 it returns.
// NOTE(review): decode(str) is declared outside this view; it presumably
// advances `str` past the bytes it consumed - confirm against the class header.
void utf8::decode(const char* str, std::u32string& decoded) {
  decoded.clear();

  while (const char32_t chr = decode(str))
    decoded.push_back(chr);
}
|
23004
|
|
|
|
|
|
|
|
|
23005
|
0
|
|
|
|
|
|
void utf8::decode(const char* str, size_t len, std::u32string& decoded) { |
|
23006
|
|
|
|
|
|
|
decoded.clear(); |
|
23007
|
|
|
|
|
|
|
|
|
23008
|
0
|
0
|
|
|
|
|
while (len) |
|
23009
|
0
|
|
|
|
|
|
decoded.push_back(decode(str, len)); |
|
23010
|
0
|
|
|
|
|
|
} |
|
23011
|
|
|
|
|
|
|
|
|
23012
|
0
|
|
|
|
|
|
void utf8::encode(const std::u32string& str, std::string& encoded) { |
|
23013
|
|
|
|
|
|
|
encoded.clear(); |
|
23014
|
|
|
|
|
|
|
|
|
23015
|
0
|
0
|
|
|
|
|
for (auto&& chr : str) |
|
23016
|
0
|
|
|
|
|
|
append(encoded, chr); |
|
23017
|
0
|
|
|
|
|
|
} |
|
23018
|
|
|
|
|
|
|
|
|
23019
|
|
|
|
|
|
|
// Namespace-scope definition of the static member utf8::REPLACEMENT_CHAR.
// NOTE(review): the in-class declaration (and its initializer) is outside this
// view - presumably it is initialized there; confirm against the class header.
const char utf8::REPLACEMENT_CHAR;
|
23020
|
|
|
|
|
|
|
|
|
23021
|
|
|
|
|
|
|
} // namespace unilib |
|
23022
|
|
|
|
|
|
|
|
|
23023
|
|
|
|
|
|
|
///////// |
|
23024
|
|
|
|
|
|
|
// File: unilib/version.cpp |
|
23025
|
|
|
|
|
|
|
///////// |
|
23026
|
|
|
|
|
|
|
|
|
23027
|
|
|
|
|
|
|
// This file is part of UniLib <http://github.com/ufal/unilib/>. |
|
23028
|
|
|
|
|
|
|
// |
|
23029
|
|
|
|
|
|
|
// Copyright 2014 Institute of Formal and Applied Linguistics, Faculty of |
|
23030
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
23031
|
|
|
|
|
|
|
// |
|
23032
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
23033
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
23034
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
23035
|
|
|
|
|
|
|
// |
|
23036
|
|
|
|
|
|
|
// UniLib version: 3.3.0 |
|
23037
|
|
|
|
|
|
|
// Unicode version: 15.0.0 |
|
23038
|
|
|
|
|
|
|
|
|
23039
|
|
|
|
|
|
|
namespace unilib { |
|
23040
|
|
|
|
|
|
|
|
|
23041
|
|
|
|
|
|
|
// Returns current version. |
|
23042
|
0
|
|
|
|
|
|
version version::current() { |
|
23043
|
0
|
0
|
|
|
|
|
return {3, 3, 0, ""}; |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
23044
|
|
|
|
|
|
|
} |
|
23045
|
|
|
|
|
|
|
|
|
23046
|
|
|
|
|
|
|
} // namespace unilib |
|
23047
|
|
|
|
|
|
|
|
|
23048
|
|
|
|
|
|
|
///////// |
|
23049
|
|
|
|
|
|
|
// File: utils/compressor_load.cpp |
|
23050
|
|
|
|
|
|
|
///////// |
|
23051
|
|
|
|
|
|
|
|
|
23052
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils <http://github.com/ufal/cpp_utils/>. |
|
23053
|
|
|
|
|
|
|
// |
|
23054
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
23055
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
23056
|
|
|
|
|
|
|
// |
|
23057
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
23058
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
23059
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
23060
|
|
|
|
|
|
|
|
|
23061
|
|
|
|
|
|
|
namespace utils { |
|
23062
|
|
|
|
|
|
|
|
|
23063
|
|
|
|
|
|
|
// Start of LZMA compression library by Igor Pavlov |
|
23064
|
|
|
|
|
|
|
namespace lzma { |
|
23065
|
|
|
|
|
|
|
|
|
23066
|
|
|
|
|
|
|
// Types.h -- Basic types |
|
23067
|
|
|
|
|
|
|
// 2010-10-09 : Igor Pavlov : Public domain |
|
23068
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
|
23069
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
|
23070
|
|
|
|
|
|
|
|
|
23071
|
|
|
|
|
|
|
/* Result codes. SZ_OK is 0; every SZ_ERROR_* code is non-zero. */
#define SZ_OK 0

#define SZ_ERROR_DATA 1
#define SZ_ERROR_MEM 2
#define SZ_ERROR_CRC 3
#define SZ_ERROR_UNSUPPORTED 4
#define SZ_ERROR_PARAM 5
#define SZ_ERROR_INPUT_EOF 6
#define SZ_ERROR_OUTPUT_EOF 7
#define SZ_ERROR_READ 8
#define SZ_ERROR_WRITE 9
#define SZ_ERROR_PROGRESS 10
#define SZ_ERROR_FAIL 11
#define SZ_ERROR_THREAD 12

#define SZ_ERROR_ARCHIVE 16
#define SZ_ERROR_NO_ARCHIVE 17
|
23088
|
|
|
|
|
|
|
|
|
23089
|
|
|
|
|
|
|
/* Result type of the LZMA routines; carries one of the SZ_* codes. */
typedef int SRes;

/* RINOK(x): evaluates x exactly once and, when the result is non-zero,
   returns that result from the enclosing function. The temporary avoids the
   double-underscore spelling, which C++ reserves for the implementation. */
#ifndef RINOK
#define RINOK(x) { int rinok_result_ = (x); if (rinok_result_ != 0) return rinok_result_; }
#endif
|
23094
|
|
|
|
|
|
|
|
|
23095
|
|
|
|
|
|
|
/* The following interfaces use first parameter as pointer to structure */ |
|
23096
|
|
|
|
|
|
|
|
|
23097
|
|
|
|
|
|
|
/* Byte-wise input interface. The callback receives a pointer to the
   structure as its first parameter. */
struct IByteIn
{
  /* Reads one byte; returns 0 in case of EOF or error. */
  uint8_t (*Read)(void *p);
};
|
23101
|
|
|
|
|
|
|
|
|
23102
|
|
|
|
|
|
|
/* Byte-wise output interface. The callback receives a pointer to the
   structure as its first parameter and the byte `b` to be written. */
struct IByteOut
{
  void (*Write)(void *p, uint8_t b);
};
|
23106
|
|
|
|
|
|
|
|
|
23107
|
|
|
|
|
|
|
struct ISeqInStream |
|
23108
|
|
|
|
|
|
|
{ |
|
23109
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
|
23110
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
|
23111
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
|
23112
|
|
|
|
|
|
|
}; |
|
23113
|
|
|
|
|
|
|
|
|
23114
|
|
|
|
|
|
|
/* it can return SZ_ERROR_INPUT_EOF */ |
|
23115
|
|
|
|
|
|
|
SRes SeqInStream_Read(ISeqInStream *stream, void *buf, size_t size); |
|
23116
|
|
|
|
|
|
|
SRes SeqInStream_Read2(ISeqInStream *stream, void *buf, size_t size, SRes errorType); |
|
23117
|
|
|
|
|
|
|
SRes SeqInStream_ReadByte(ISeqInStream *stream, uint8_t *buf); |
|
23118
|
|
|
|
|
|
|
|
|
23119
|
|
|
|
|
|
|
/* Sequential output stream interface. */
struct ISeqOutStream
{
  /* Writes `size` bytes from `buf`.
     Returns: result - the number of actually written bytes.
     (result < size) means error */
  size_t (*Write)(void *p, const void *buf, size_t size);
};
|
23125
|
|
|
|
|
|
|
|
|
23126
|
|
|
|
|
|
|
/* Origin selector passed to the Seek callbacks of the stream interfaces. */
enum ESzSeek
{
  SZ_SEEK_SET = 0,
  SZ_SEEK_CUR = 1,
  SZ_SEEK_END = 2
};
|
23132
|
|
|
|
|
|
|
|
|
23133
|
|
|
|
|
|
|
struct ISeekInStream |
|
23134
|
|
|
|
|
|
|
{ |
|
23135
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); /* same as ISeqInStream::Read */ |
|
23136
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
|
23137
|
|
|
|
|
|
|
}; |
|
23138
|
|
|
|
|
|
|
|
|
23139
|
|
|
|
|
|
|
struct ILookInStream |
|
23140
|
|
|
|
|
|
|
{ |
|
23141
|
|
|
|
|
|
|
SRes (*Look)(void *p, const void **buf, size_t *size); |
|
23142
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
|
23143
|
|
|
|
|
|
|
(output(*size) > input(*size)) is not allowed |
|
23144
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
|
23145
|
|
|
|
|
|
|
SRes (*Skip)(void *p, size_t offset); |
|
23146
|
|
|
|
|
|
|
/* offset must be <= output(*size) of Look */ |
|
23147
|
|
|
|
|
|
|
|
|
23148
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
|
23149
|
|
|
|
|
|
|
/* reads directly (without buffer). It's same as ISeqInStream::Read */ |
|
23150
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
|
23151
|
|
|
|
|
|
|
}; |
|
23152
|
|
|
|
|
|
|
|
|
23153
|
|
|
|
|
|
|
/* Helpers operating on an ILookInStream; defined elsewhere. */
SRes LookInStream_LookRead(ILookInStream *stream, void *buf, size_t *size);
SRes LookInStream_SeekTo(ILookInStream *stream, uint64_t offset);

/* The following two read via ILookInStream::Read. */
SRes LookInStream_Read2(ILookInStream *stream, void *buf, size_t size, SRes errorType);
SRes LookInStream_Read(ILookInStream *stream, void *buf, size_t size);
|
23159
|
|
|
|
|
|
|
|
|
23160
|
|
|
|
|
|
|
#define LookToRead_BUF_SIZE (1 << 14) |
|
23161
|
|
|
|
|
|
|
|
|
23162
|
|
|
|
|
|
|
struct CLookToRead |
|
23163
|
|
|
|
|
|
|
{ |
|
23164
|
|
|
|
|
|
|
ILookInStream s; |
|
23165
|
|
|
|
|
|
|
ISeekInStream *realStream; |
|
23166
|
|
|
|
|
|
|
size_t pos; |
|
23167
|
|
|
|
|
|
|
size_t size; |
|
23168
|
|
|
|
|
|
|
uint8_t buf[LookToRead_BUF_SIZE]; |
|
23169
|
|
|
|
|
|
|
}; |
|
23170
|
|
|
|
|
|
|
|
|
23171
|
|
|
|
|
|
|
void LookToRead_CreateVTable(CLookToRead *p, int lookahead); |
|
23172
|
|
|
|
|
|
|
void LookToRead_Init(CLookToRead *p); |
|
23173
|
|
|
|
|
|
|
|
|
23174
|
|
|
|
|
|
|
struct CSecToLook |
|
23175
|
|
|
|
|
|
|
{ |
|
23176
|
|
|
|
|
|
|
ISeqInStream s; |
|
23177
|
|
|
|
|
|
|
ILookInStream *realStream; |
|
23178
|
|
|
|
|
|
|
}; |
|
23179
|
|
|
|
|
|
|
|
|
23180
|
|
|
|
|
|
|
void SecToLook_CreateVTable(CSecToLook *p); |
|
23181
|
|
|
|
|
|
|
|
|
23182
|
|
|
|
|
|
|
struct CSecToRead |
|
23183
|
|
|
|
|
|
|
{ |
|
23184
|
|
|
|
|
|
|
ISeqInStream s; |
|
23185
|
|
|
|
|
|
|
ILookInStream *realStream; |
|
23186
|
|
|
|
|
|
|
}; |
|
23187
|
|
|
|
|
|
|
|
|
23188
|
|
|
|
|
|
|
void SecToRead_CreateVTable(CSecToRead *p); |
|
23189
|
|
|
|
|
|
|
|
|
23190
|
|
|
|
|
|
|
struct ICompressProgress |
|
23191
|
|
|
|
|
|
|
{ |
|
23192
|
|
|
|
|
|
|
SRes (*Progress)(void *p, uint64_t inSize, uint64_t outSize); |
|
23193
|
|
|
|
|
|
|
/* Returns: result. (result != SZ_OK) means break. |
|
23194
|
|
|
|
|
|
|
Value (uint64_t)(int64_t)-1 for size means unknown value. */ |
|
23195
|
|
|
|
|
|
|
}; |
|
23196
|
|
|
|
|
|
|
|
|
23197
|
|
|
|
|
|
|
/* Memory allocation interface. */
struct ISzAlloc
{
  void *(*Alloc)(void *p, size_t size);
  void (*Free)(void *p, void *address); /* address can be 0 */
};

/* Convenience wrappers that pass the allocator itself as the first argument. */
#define IAlloc_Alloc(p, size) (p)->Alloc((p), size)
#define IAlloc_Free(p, a) (p)->Free((p), a)
|
23205
|
|
|
|
|
|
|
|
|
23206
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
|
23207
|
|
|
|
|
|
|
|
|
23208
|
|
|
|
|
|
|
// LzmaDec.h -- LZMA Decoder |
|
23209
|
|
|
|
|
|
|
// 2009-02-07 : Igor Pavlov : Public domain |
|
23210
|
|
|
|
|
|
|
|
|
23211
|
|
|
|
|
|
|
/* #define _LZMA_PROB32 */
/* _LZMA_PROB32 can increase the speed on some CPUs,
   but memory usage for CLzmaDec::probs will be doubled in that case */

/* CLzmaProb is 32-bit when _LZMA_PROB32 is defined, 16-bit otherwise. */
#ifdef _LZMA_PROB32
#define CLzmaProb uint32_t
#else
#define CLzmaProb uint16_t
#endif
|
23220
|
|
|
|
|
|
|
|
|
23221
|
|
|
|
|
|
|
/* ---------- LZMA Properties ---------- */ |
|
23222
|
|
|
|
|
|
|
|
|
23223
|
|
|
|
|
|
|
#define LZMA_PROPS_SIZE 5

/* LZMA properties, filled in by LzmaProps_Decode.
   NOTE(review): lc/lp/pb presumably are the literal-context, literal-position
   and position bit counts of the LZMA format - confirm. */
struct CLzmaProps
{
  unsigned lc;
  unsigned lp;
  unsigned pb;
  uint32_t dicSize;
};
|
23230
|
|
|
|
|
|
|
|
|
23231
|
|
|
|
|
|
|
/* LzmaProps_Decode - decodes properties |
|
23232
|
|
|
|
|
|
|
Returns: |
|
23233
|
|
|
|
|
|
|
SZ_OK |
|
23234
|
|
|
|
|
|
|
SZ_ERROR_UNSUPPORTED - Unsupported properties |
|
23235
|
|
|
|
|
|
|
*/ |
|
23236
|
|
|
|
|
|
|
|
|
23237
|
|
|
|
|
|
|
SRes LzmaProps_Decode(CLzmaProps *p, const uint8_t *data, unsigned size); |
|
23238
|
|
|
|
|
|
|
|
|
23239
|
|
|
|
|
|
|
/* ---------- LZMA Decoder state ---------- */ |
|
23240
|
|
|
|
|
|
|
|
|
23241
|
|
|
|
|
|
|
/* LZMA_REQUIRED_INPUT_MAX = number of required input bytes for worst case. |
|
23242
|
|
|
|
|
|
|
Num bits = log2((2^11 / 31) ^ 22) + 26 < 134 + 26 = 160; */ |
|
23243
|
|
|
|
|
|
|
|
|
23244
|
|
|
|
|
|
|
#define LZMA_REQUIRED_INPUT_MAX 20 |
|
23245
|
|
|
|
|
|
|
|
|
23246
|
|
|
|
|
|
|
struct CLzmaDec |
|
23247
|
|
|
|
|
|
|
{ |
|
23248
|
|
|
|
|
|
|
CLzmaProps prop; |
|
23249
|
|
|
|
|
|
|
CLzmaProb *probs; |
|
23250
|
|
|
|
|
|
|
uint8_t *dic; |
|
23251
|
|
|
|
|
|
|
const uint8_t *buf; |
|
23252
|
|
|
|
|
|
|
uint32_t range, code; |
|
23253
|
|
|
|
|
|
|
size_t dicPos; |
|
23254
|
|
|
|
|
|
|
size_t dicBufSize; |
|
23255
|
|
|
|
|
|
|
uint32_t processedPos; |
|
23256
|
|
|
|
|
|
|
uint32_t checkDicSize; |
|
23257
|
|
|
|
|
|
|
unsigned state; |
|
23258
|
|
|
|
|
|
|
uint32_t reps[4]; |
|
23259
|
|
|
|
|
|
|
unsigned remainLen; |
|
23260
|
|
|
|
|
|
|
int needFlush; |
|
23261
|
|
|
|
|
|
|
int needInitState; |
|
23262
|
|
|
|
|
|
|
uint32_t numProbs; |
|
23263
|
|
|
|
|
|
|
unsigned tempBufSize; |
|
23264
|
|
|
|
|
|
|
uint8_t tempBuf[LZMA_REQUIRED_INPUT_MAX]; |
|
23265
|
|
|
|
|
|
|
}; |
|
23266
|
|
|
|
|
|
|
|
|
23267
|
|
|
|
|
|
|
/* Puts a decoder into the "nothing allocated" state by nulling both owned
   pointers. */
#define LzmaDec_Construct(p) { (p)->dic = nullptr; (p)->probs = nullptr; }
|
23268
|
|
|
|
|
|
|
|
|
23269
|
|
|
|
|
|
|
/* Defined elsewhere. Called once for each new stream, before decoding it. */
void LzmaDec_Init(CLzmaDec *p);
|
23270
|
|
|
|
|
|
|
|
|
23271
|
|
|
|
|
|
|
/* There are two types of LZMA streams:
     0) Stream with end mark. That end mark adds about 6 bytes to compressed size.
     1) Stream without end mark. You must know exact uncompressed size to decompress such stream. */

enum ELzmaFinishMode
{
  LZMA_FINISH_ANY,   /* finish at any point */
  LZMA_FINISH_END    /* block must be finished at the end */
};
|
23280
|
|
|
|
|
|
|
|
|
23281
|
|
|
|
|
|
|
/* ELzmaFinishMode has meaning only if the decoding reaches output limit !!! |
|
23282
|
|
|
|
|
|
|
|
|
23283
|
|
|
|
|
|
|
You must use LZMA_FINISH_END, when you know that current output buffer |
|
23284
|
|
|
|
|
|
|
covers last bytes of block. In other cases you must use LZMA_FINISH_ANY. |
|
23285
|
|
|
|
|
|
|
|
|
23286
|
|
|
|
|
|
|
If LZMA decoder sees end marker before reaching output limit, it returns SZ_OK, |
|
23287
|
|
|
|
|
|
|
and output value of destLen will be less than output buffer size limit. |
|
23288
|
|
|
|
|
|
|
You can check status result also. |
|
23289
|
|
|
|
|
|
|
|
|
23290
|
|
|
|
|
|
|
You can use multiple checks to test data integrity after full decompression: |
|
23291
|
|
|
|
|
|
|
1) Check Result and "status" variable. |
|
23292
|
|
|
|
|
|
|
2) Check that output(destLen) = uncompressedSize, if you know real uncompressedSize. |
|
23293
|
|
|
|
|
|
|
3) Check that output(srcLen) = compressedSize, if you know real compressedSize. |
|
23294
|
|
|
|
|
|
|
You must use correct finish mode in that case. */ |
|
23295
|
|
|
|
|
|
|
|
|
23296
|
|
|
|
|
|
|
/* Decoding status; used only as an output value of the decode calls. */
enum ELzmaStatus
{
  LZMA_STATUS_NOT_SPECIFIED,               /* use main error code instead */
  LZMA_STATUS_FINISHED_WITH_MARK,          /* stream was finished with end mark. */
  LZMA_STATUS_NOT_FINISHED,                /* stream was not finished */
  LZMA_STATUS_NEEDS_MORE_INPUT,            /* you must provide more input bytes */
  LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK  /* the stream may have finished without an end mark */
};
|
23304
|
|
|
|
|
|
|
|
|
23305
|
|
|
|
|
|
|
/* ELzmaStatus is used only as output value for function call */ |
|
23306
|
|
|
|
|
|
|
|
|
23307
|
|
|
|
|
|
|
/* ---------- Interfaces ---------- */ |
|
23308
|
|
|
|
|
|
|
|
|
23309
|
|
|
|
|
|
|
/* There are 3 levels of interfaces: |
|
23310
|
|
|
|
|
|
|
1) Dictionary Interface |
|
23311
|
|
|
|
|
|
|
2) Buffer Interface |
|
23312
|
|
|
|
|
|
|
3) One Call Interface |
|
23313
|
|
|
|
|
|
|
You can select any of these interfaces, but don't mix functions from different |
|
23314
|
|
|
|
|
|
|
groups for same object. */ |
|
23315
|
|
|
|
|
|
|
|
|
23316
|
|
|
|
|
|
|
/* There are two variants to allocate state for Dictionary Interface: |
|
23317
|
|
|
|
|
|
|
1) LzmaDec_Allocate / LzmaDec_Free |
|
23318
|
|
|
|
|
|
|
2) LzmaDec_AllocateProbs / LzmaDec_FreeProbs |
|
23319
|
|
|
|
|
|
|
You can use variant 2, if you set dictionary buffer manually. |
|
23320
|
|
|
|
|
|
|
For Buffer Interface you must always use variant 1. |
|
23321
|
|
|
|
|
|
|
|
|
23322
|
|
|
|
|
|
|
LzmaDec_Allocate* can return: |
|
23323
|
|
|
|
|
|
|
SZ_OK |
|
23324
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
|
23325
|
|
|
|
|
|
|
SZ_ERROR_UNSUPPORTED - Unsupported properties |
|
23326
|
|
|
|
|
|
|
*/ |
|
23327
|
|
|
|
|
|
|
|
|
23328
|
|
|
|
|
|
|
SRes LzmaDec_AllocateProbs(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc); |
|
23329
|
|
|
|
|
|
|
void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc); |
|
23330
|
|
|
|
|
|
|
|
|
23331
|
|
|
|
|
|
|
SRes LzmaDec_Allocate(CLzmaDec *state, const uint8_t *prop, unsigned propsSize, ISzAlloc *alloc); |
|
23332
|
|
|
|
|
|
|
void LzmaDec_Free(CLzmaDec *state, ISzAlloc *alloc); |
|
23333
|
|
|
|
|
|
|
|
|
23334
|
|
|
|
|
|
|
/* ---------- Dictionary Interface ---------- */ |
|
23335
|
|
|
|
|
|
|
|
|
23336
|
|
|
|
|
|
|
/* You can use it, if you want to eliminate the overhead for data copying from |
|
23337
|
|
|
|
|
|
|
dictionary to some other external buffer. |
|
23338
|
|
|
|
|
|
|
You must work with CLzmaDec variables directly in this interface. |
|
23339
|
|
|
|
|
|
|
|
|
23340
|
|
|
|
|
|
|
STEPS: |
|
23341
|
|
|
|
|
|
|
LzmaDec_Construct() |
|
23342
|
|
|
|
|
|
|
LzmaDec_Allocate() |
|
23343
|
|
|
|
|
|
|
for (each new stream) |
|
23344
|
|
|
|
|
|
|
{ |
|
23345
|
|
|
|
|
|
|
LzmaDec_Init() |
|
23346
|
|
|
|
|
|
|
while (it needs more decompression) |
|
23347
|
|
|
|
|
|
|
{ |
|
23348
|
|
|
|
|
|
|
LzmaDec_DecodeToDic() |
|
23349
|
|
|
|
|
|
|
use data from CLzmaDec::dic and update CLzmaDec::dicPos |
|
23350
|
|
|
|
|
|
|
} |
|
23351
|
|
|
|
|
|
|
} |
|
23352
|
|
|
|
|
|
|
LzmaDec_Free() |
|
23353
|
|
|
|
|
|
|
*/ |
|
23354
|
|
|
|
|
|
|
|
|
23355
|
|
|
|
|
|
|
/* LzmaDec_DecodeToDic |
|
23356
|
|
|
|
|
|
|
|
|
23357
|
|
|
|
|
|
|
The decoding to internal dictionary buffer (CLzmaDec::dic). |
|
23358
|
|
|
|
|
|
|
You must manually update CLzmaDec::dicPos, if it reaches CLzmaDec::dicBufSize !!! |
|
23359
|
|
|
|
|
|
|
|
|
23360
|
|
|
|
|
|
|
finishMode: |
|
23361
|
|
|
|
|
|
|
It has meaning only if the decoding reaches output limit (dicLimit). |
|
23362
|
|
|
|
|
|
|
LZMA_FINISH_ANY - Decode just dicLimit bytes. |
|
23363
|
|
|
|
|
|
|
LZMA_FINISH_END - Stream must be finished after dicLimit. |
|
23364
|
|
|
|
|
|
|
|
|
23365
|
|
|
|
|
|
|
Returns: |
|
23366
|
|
|
|
|
|
|
SZ_OK |
|
23367
|
|
|
|
|
|
|
status: |
|
23368
|
|
|
|
|
|
|
LZMA_STATUS_FINISHED_WITH_MARK |
|
23369
|
|
|
|
|
|
|
LZMA_STATUS_NOT_FINISHED |
|
23370
|
|
|
|
|
|
|
LZMA_STATUS_NEEDS_MORE_INPUT |
|
23371
|
|
|
|
|
|
|
LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK |
|
23372
|
|
|
|
|
|
|
SZ_ERROR_DATA - Data error |
|
23373
|
|
|
|
|
|
|
*/ |
|
23374
|
|
|
|
|
|
|
|
|
23375
|
|
|
|
|
|
|
SRes LzmaDec_DecodeToDic(CLzmaDec *p, size_t dicLimit, |
|
23376
|
|
|
|
|
|
|
const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status); |
|
23377
|
|
|
|
|
|
|
|
|
23378
|
|
|
|
|
|
|
/* ---------- Buffer Interface ---------- */ |
|
23379
|
|
|
|
|
|
|
|
|
23380
|
|
|
|
|
|
|
/* It's zlib-like interface. |
|
23381
|
|
|
|
|
|
|
See LzmaDec_DecodeToDic description for information about STEPS and return results, |
|
23382
|
|
|
|
|
|
|
but you must use LzmaDec_DecodeToBuf instead of LzmaDec_DecodeToDic and you don't need |
|
23383
|
|
|
|
|
|
|
to work with CLzmaDec variables manually. |
|
23384
|
|
|
|
|
|
|
|
|
23385
|
|
|
|
|
|
|
finishMode: |
|
23386
|
|
|
|
|
|
|
It has meaning only if the decoding reaches output limit (*destLen). |
|
23387
|
|
|
|
|
|
|
LZMA_FINISH_ANY - Decode just destLen bytes. |
|
23388
|
|
|
|
|
|
|
LZMA_FINISH_END - Stream must be finished after (*destLen). |
|
23389
|
|
|
|
|
|
|
*/ |
|
23390
|
|
|
|
|
|
|
|
|
23391
|
|
|
|
|
|
|
SRes LzmaDec_DecodeToBuf(CLzmaDec *p, uint8_t *dest, size_t *destLen, |
|
23392
|
|
|
|
|
|
|
const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status); |
|
23393
|
|
|
|
|
|
|
|
|
23394
|
|
|
|
|
|
|
/* ---------- One Call Interface ---------- */ |
|
23395
|
|
|
|
|
|
|
|
|
23396
|
|
|
|
|
|
|
/* LzmaDecode |
|
23397
|
|
|
|
|
|
|
|
|
23398
|
|
|
|
|
|
|
finishMode: |
|
23399
|
|
|
|
|
|
|
It has meaning only if the decoding reaches output limit (*destLen). |
|
23400
|
|
|
|
|
|
|
LZMA_FINISH_ANY - Decode just destLen bytes. |
|
23401
|
|
|
|
|
|
|
LZMA_FINISH_END - Stream must be finished after (*destLen). |
|
23402
|
|
|
|
|
|
|
|
|
23403
|
|
|
|
|
|
|
Returns: |
|
23404
|
|
|
|
|
|
|
SZ_OK |
|
23405
|
|
|
|
|
|
|
status: |
|
23406
|
|
|
|
|
|
|
LZMA_STATUS_FINISHED_WITH_MARK |
|
23407
|
|
|
|
|
|
|
LZMA_STATUS_NOT_FINISHED |
|
23408
|
|
|
|
|
|
|
LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK |
|
23409
|
|
|
|
|
|
|
SZ_ERROR_DATA - Data error |
|
23410
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
|
23411
|
|
|
|
|
|
|
SZ_ERROR_UNSUPPORTED - Unsupported properties |
|
23412
|
|
|
|
|
|
|
SZ_ERROR_INPUT_EOF - It needs more bytes in input buffer (src). |
|
23413
|
|
|
|
|
|
|
*/ |
|
23414
|
|
|
|
|
|
|
|
|
23415
|
|
|
|
|
|
|
SRes LzmaDecode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen, |
|
23416
|
|
|
|
|
|
|
const uint8_t *propData, unsigned propSize, ELzmaFinishMode finishMode, |
|
23417
|
|
|
|
|
|
|
ELzmaStatus *status, ISzAlloc *alloc); |
|
23418
|
|
|
|
|
|
|
|
|
23419
|
|
|
|
|
|
|
// LzmaDec.c -- LZMA Decoder |
|
23420
|
|
|
|
|
|
|
// 2009-09-20 : Igor Pavlov : Public domain |
|
23421
|
|
|
|
|
|
|
|
|
23422
|
|
|
|
|
|
|
#define kNumTopBits 24 |
|
23423
|
|
|
|
|
|
|
#define kTopValue ((uint32_t)1 << kNumTopBits) |
|
23424
|
|
|
|
|
|
|
|
|
23425
|
|
|
|
|
|
|
#define kNumBitModelTotalBits 11 |
|
23426
|
|
|
|
|
|
|
#define kBitModelTotal (1 << kNumBitModelTotalBits) |
|
23427
|
|
|
|
|
|
|
#define kNumMoveBits 5 |
|
23428
|
|
|
|
|
|
|
|
|
23429
|
|
|
|
|
|
|
#define RC_INIT_SIZE 5 |
|
23430
|
|
|
|
|
|
|
|
|
23431
|
|
|
|
|
|
|
#define NORMALIZE if (range < kTopValue) { range <<= 8; code = (code << 8) | (*buf++); } |
|
23432
|
|
|
|
|
|
|
|
|
23433
|
|
|
|
|
|
|
#define IF_BIT_0(p) ttt = *(p); NORMALIZE; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound) |
|
23434
|
|
|
|
|
|
|
#define UPDATE_0(p) range = bound; *(p) = (CLzmaProb)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); |
|
23435
|
|
|
|
|
|
|
#define UPDATE_1(p) range -= bound; code -= bound; *(p) = (CLzmaProb)(ttt - (ttt >> kNumMoveBits)); |
|
23436
|
|
|
|
|
|
|
#define GET_BIT2(p, i, A0, A1) IF_BIT_0(p) \ |
|
23437
|
|
|
|
|
|
|
{ UPDATE_0(p); i = (i + i); A0; } else \ |
|
23438
|
|
|
|
|
|
|
{ UPDATE_1(p); i = (i + i) + 1; A1; } |
|
23439
|
|
|
|
|
|
|
#define GET_BIT(p, i) GET_BIT2(p, i, ; , ;) |
|
23440
|
|
|
|
|
|
|
|
|
23441
|
|
|
|
|
|
|
#define TREE_GET_BIT(probs, i) { GET_BIT((probs + i), i); } |
|
23442
|
|
|
|
|
|
|
#define TREE_DECODE(probs, limit, i) \ |
|
23443
|
|
|
|
|
|
|
{ i = 1; do { TREE_GET_BIT(probs, i); } while (i < limit); i -= limit; } |
|
23444
|
|
|
|
|
|
|
|
|
23445
|
|
|
|
|
|
|
/* #define _LZMA_SIZE_OPT */ |
|
23446
|
|
|
|
|
|
|
|
|
23447
|
|
|
|
|
|
|
#ifdef _LZMA_SIZE_OPT |
|
23448
|
|
|
|
|
|
|
#define TREE_6_DECODE(probs, i) TREE_DECODE(probs, (1 << 6), i) |
|
23449
|
|
|
|
|
|
|
#else |
|
23450
|
|
|
|
|
|
|
#define TREE_6_DECODE(probs, i) \ |
|
23451
|
|
|
|
|
|
|
{ i = 1; \ |
|
23452
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
|
23453
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
|
23454
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
|
23455
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
|
23456
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
|
23457
|
|
|
|
|
|
|
TREE_GET_BIT(probs, i); \ |
|
23458
|
|
|
|
|
|
|
i -= 0x40; } |
|
23459
|
|
|
|
|
|
|
#endif |
|
23460
|
|
|
|
|
|
|
|
|
23461
|
|
|
|
|
|
|
#define NORMALIZE_CHECK if (range < kTopValue) { if (buf >= bufLimit) return DUMMY_ERROR; range <<= 8; code = (code << 8) | (*buf++); } |
|
23462
|
|
|
|
|
|
|
|
|
23463
|
|
|
|
|
|
|
#define IF_BIT_0_CHECK(p) ttt = *(p); NORMALIZE_CHECK; bound = (range >> kNumBitModelTotalBits) * ttt; if (code < bound) |
|
23464
|
|
|
|
|
|
|
#define UPDATE_0_CHECK range = bound; |
|
23465
|
|
|
|
|
|
|
#define UPDATE_1_CHECK range -= bound; code -= bound; |
|
23466
|
|
|
|
|
|
|
#define GET_BIT2_CHECK(p, i, A0, A1) IF_BIT_0_CHECK(p) \ |
|
23467
|
|
|
|
|
|
|
{ UPDATE_0_CHECK; i = (i + i); A0; } else \ |
|
23468
|
|
|
|
|
|
|
{ UPDATE_1_CHECK; i = (i + i) + 1; A1; } |
|
23469
|
|
|
|
|
|
|
#define GET_BIT_CHECK(p, i) GET_BIT2_CHECK(p, i, ; , ;) |
|
23470
|
|
|
|
|
|
|
#define TREE_DECODE_CHECK(probs, limit, i) \ |
|
23471
|
|
|
|
|
|
|
{ i = 1; do { GET_BIT_CHECK(probs + i, i) } while (i < limit); i -= limit; } |
|
23472
|
|
|
|
|
|
|
|
|
23473
|
|
|
|
|
|
|
#define kNumPosBitsMax 4 |
|
23474
|
|
|
|
|
|
|
#define kNumPosStatesMax (1 << kNumPosBitsMax) |
|
23475
|
|
|
|
|
|
|
|
|
23476
|
|
|
|
|
|
|
#define kLenNumLowBits 3 |
|
23477
|
|
|
|
|
|
|
#define kLenNumLowSymbols (1 << kLenNumLowBits) |
|
23478
|
|
|
|
|
|
|
#define kLenNumMidBits 3 |
|
23479
|
|
|
|
|
|
|
#define kLenNumMidSymbols (1 << kLenNumMidBits) |
|
23480
|
|
|
|
|
|
|
#define kLenNumHighBits 8 |
|
23481
|
|
|
|
|
|
|
#define kLenNumHighSymbols (1 << kLenNumHighBits) |
|
23482
|
|
|
|
|
|
|
|
|
23483
|
|
|
|
|
|
|
#define LenChoice 0 |
|
23484
|
|
|
|
|
|
|
#define LenChoice2 (LenChoice + 1) |
|
23485
|
|
|
|
|
|
|
#define LenLow (LenChoice2 + 1) |
|
23486
|
|
|
|
|
|
|
#define LenMid (LenLow + (kNumPosStatesMax << kLenNumLowBits)) |
|
23487
|
|
|
|
|
|
|
#define LenHigh (LenMid + (kNumPosStatesMax << kLenNumMidBits)) |
|
23488
|
|
|
|
|
|
|
#define kNumLenProbs (LenHigh + kLenNumHighSymbols) |
|
23489
|
|
|
|
|
|
|
|
|
23490
|
|
|
|
|
|
|
#define kNumStates 12 |
|
23491
|
|
|
|
|
|
|
#define kNumLitStates 7 |
|
23492
|
|
|
|
|
|
|
|
|
23493
|
|
|
|
|
|
|
#define kStartPosModelIndex 4 |
|
23494
|
|
|
|
|
|
|
#define kEndPosModelIndex 14 |
|
23495
|
|
|
|
|
|
|
#define kNumFullDistances (1 << (kEndPosModelIndex >> 1)) |
|
23496
|
|
|
|
|
|
|
|
|
23497
|
|
|
|
|
|
|
#define kNumPosSlotBits 6 |
|
23498
|
|
|
|
|
|
|
#define kNumLenToPosStates 4 |
|
23499
|
|
|
|
|
|
|
|
|
23500
|
|
|
|
|
|
|
#define kNumAlignBits 4 |
|
23501
|
|
|
|
|
|
|
#define kAlignTableSize (1 << kNumAlignBits) |
|
23502
|
|
|
|
|
|
|
|
|
23503
|
|
|
|
|
|
|
#define kMatchMinLen 2 |
|
23504
|
|
|
|
|
|
|
#define kMatchSpecLenStart (kMatchMinLen + kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols) |
|
23505
|
|
|
|
|
|
|
|
|
23506
|
|
|
|
|
|
|
#define IsMatch 0 |
|
23507
|
|
|
|
|
|
|
#define IsRep (IsMatch + (kNumStates << kNumPosBitsMax)) |
|
23508
|
|
|
|
|
|
|
#define IsRepG0 (IsRep + kNumStates) |
|
23509
|
|
|
|
|
|
|
#define IsRepG1 (IsRepG0 + kNumStates) |
|
23510
|
|
|
|
|
|
|
#define IsRepG2 (IsRepG1 + kNumStates) |
|
23511
|
|
|
|
|
|
|
#define IsRep0Long (IsRepG2 + kNumStates) |
|
23512
|
|
|
|
|
|
|
#define PosSlot (IsRep0Long + (kNumStates << kNumPosBitsMax)) |
|
23513
|
|
|
|
|
|
|
#define SpecPos (PosSlot + (kNumLenToPosStates << kNumPosSlotBits)) |
|
23514
|
|
|
|
|
|
|
#define Align (SpecPos + kNumFullDistances - kEndPosModelIndex) |
|
23515
|
|
|
|
|
|
|
#define LenCoder (Align + kAlignTableSize) |
|
23516
|
|
|
|
|
|
|
#define RepLenCoder (LenCoder + kNumLenProbs) |
|
23517
|
|
|
|
|
|
|
#define Literal (RepLenCoder + kNumLenProbs) |
|
23518
|
|
|
|
|
|
|
|
|
23519
|
|
|
|
|
|
|
#define LZMA_BASE_SIZE 1846 |
|
23520
|
|
|
|
|
|
|
#define LZMA_LIT_SIZE 768 |
|
23521
|
|
|
|
|
|
|
|
|
23522
|
|
|
|
|
|
|
#define LzmaProps_GetNumProbs(p) ((uint32_t)LZMA_BASE_SIZE + (LZMA_LIT_SIZE << ((p)->lc + (p)->lp))) |
|
23523
|
|
|
|
|
|
|
|
|
23524
|
|
|
|
|
|
|
#if Literal != LZMA_BASE_SIZE |
|
23525
|
|
|
|
|
|
|
StopCompilingDueBUG |
|
23526
|
|
|
|
|
|
|
#endif |
|
23527
|
|
|
|
|
|
|
|
|
23528
|
|
|
|
|
|
|
#define LZMA_DIC_MIN (1 << 12) |
|
23529
|
|
|
|
|
|
|
|
|
23530
|
|
|
|
|
|
|
/* First LZMA-symbol is always decoded. |
|
23531
|
|
|
|
|
|
|
And it decodes new LZMA-symbols while (buf < bufLimit), but "buf" is without last normalization |
|
23532
|
|
|
|
|
|
|
Out: |
|
23533
|
|
|
|
|
|
|
Result: |
|
23534
|
|
|
|
|
|
|
SZ_OK - OK |
|
23535
|
|
|
|
|
|
|
SZ_ERROR_DATA - Error |
|
23536
|
|
|
|
|
|
|
p->remainLen: |
|
23537
|
|
|
|
|
|
|
< kMatchSpecLenStart : normal remain |
|
23538
|
|
|
|
|
|
|
= kMatchSpecLenStart : finished |
|
23539
|
|
|
|
|
|
|
= kMatchSpecLenStart + 1 : Flush marker |
|
23540
|
|
|
|
|
|
|
= kMatchSpecLenStart + 2 : State Init Marker |
|
23541
|
|
|
|
|
|
|
*/ |
|
23542
|
|
|
|
|
|
|
|
|
23543
|
504
|
|
|
|
|
|
static int LzmaDec_DecodeReal(CLzmaDec *p, size_t limit, const uint8_t *bufLimit) |
|
23544
|
|
|
|
|
|
|
{ |
|
23545
|
504
|
|
|
|
|
|
CLzmaProb *probs = p->probs; |
|
23546
|
|
|
|
|
|
|
|
|
23547
|
504
|
|
|
|
|
|
unsigned state = p->state; |
|
23548
|
504
|
|
|
|
|
|
uint32_t rep0 = p->reps[0], rep1 = p->reps[1], rep2 = p->reps[2], rep3 = p->reps[3]; |
|
23549
|
504
|
|
|
|
|
|
unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1; |
|
23550
|
504
|
|
|
|
|
|
unsigned lpMask = ((unsigned)1 << (p->prop.lp)) - 1; |
|
23551
|
504
|
|
|
|
|
|
unsigned lc = p->prop.lc; |
|
23552
|
|
|
|
|
|
|
|
|
23553
|
504
|
|
|
|
|
|
uint8_t *dic = p->dic; |
|
23554
|
504
|
|
|
|
|
|
size_t dicBufSize = p->dicBufSize; |
|
23555
|
504
|
|
|
|
|
|
size_t dicPos = p->dicPos; |
|
23556
|
|
|
|
|
|
|
|
|
23557
|
504
|
|
|
|
|
|
uint32_t processedPos = p->processedPos; |
|
23558
|
504
|
|
|
|
|
|
uint32_t checkDicSize = p->checkDicSize; |
|
23559
|
|
|
|
|
|
|
unsigned len = 0; |
|
23560
|
|
|
|
|
|
|
|
|
23561
|
504
|
|
|
|
|
|
const uint8_t *buf = p->buf; |
|
23562
|
504
|
|
|
|
|
|
uint32_t range = p->range; |
|
23563
|
504
|
|
|
|
|
|
uint32_t code = p->code; |
|
23564
|
|
|
|
|
|
|
|
|
23565
|
|
|
|
|
|
|
do |
|
23566
|
|
|
|
|
|
|
{ |
|
23567
|
|
|
|
|
|
|
CLzmaProb *prob; |
|
23568
|
|
|
|
|
|
|
uint32_t bound; |
|
23569
|
|
|
|
|
|
|
unsigned ttt; |
|
23570
|
107442
|
|
|
|
|
|
unsigned posState = processedPos & pbMask; |
|
23571
|
|
|
|
|
|
|
|
|
23572
|
107442
|
|
|
|
|
|
prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; |
|
23573
|
107442
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
|
100
|
|
|
|
|
|
|
23574
|
|
|
|
|
|
|
{ |
|
23575
|
|
|
|
|
|
|
unsigned symbol; |
|
23576
|
23097
|
|
|
|
|
|
UPDATE_0(prob); |
|
23577
|
23097
|
|
|
|
|
|
prob = probs + Literal; |
|
23578
|
23097
|
100
|
|
|
|
|
if (checkDicSize != 0 || processedPos != 0) |
|
23579
|
46182
|
|
|
|
|
|
prob += (LZMA_LIT_SIZE * (((processedPos & lpMask) << lc) + |
|
23580
|
23091
|
50
|
|
|
|
|
(dic[(dicPos == 0 ? dicBufSize : dicPos) - 1] >> (8 - lc)))); |
|
23581
|
|
|
|
|
|
|
|
|
23582
|
23097
|
100
|
|
|
|
|
if (state < kNumLitStates) |
|
23583
|
|
|
|
|
|
|
{ |
|
23584
|
21934
|
|
|
|
|
|
state -= (state < 4) ? state : 3; |
|
23585
|
|
|
|
|
|
|
symbol = 1; |
|
23586
|
175472
|
100
|
|
|
|
|
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23587
|
|
|
|
|
|
|
} |
|
23588
|
|
|
|
|
|
|
else |
|
23589
|
|
|
|
|
|
|
{ |
|
23590
|
1163
|
50
|
|
|
|
|
unsigned matchByte = p->dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
|
23591
|
|
|
|
|
|
|
unsigned offs = 0x100; |
|
23592
|
1163
|
100
|
|
|
|
|
state -= (state < 10) ? 3 : 6; |
|
23593
|
|
|
|
|
|
|
symbol = 1; |
|
23594
|
|
|
|
|
|
|
do |
|
23595
|
|
|
|
|
|
|
{ |
|
23596
|
|
|
|
|
|
|
unsigned bit; |
|
23597
|
|
|
|
|
|
|
CLzmaProb *probLit; |
|
23598
|
9304
|
|
|
|
|
|
matchByte <<= 1; |
|
23599
|
9304
|
|
|
|
|
|
bit = (matchByte & offs); |
|
23600
|
9304
|
|
|
|
|
|
probLit = prob + offs + bit + symbol; |
|
23601
|
9304
|
100
|
|
|
|
|
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
|
|
|
100
|
|
|
|
|
|
|
23602
|
|
|
|
|
|
|
} |
|
23603
|
9304
|
100
|
|
|
|
|
while (symbol < 0x100); |
|
23604
|
|
|
|
|
|
|
} |
|
23605
|
23097
|
|
|
|
|
|
dic[dicPos++] = (uint8_t)symbol; |
|
23606
|
23097
|
|
|
|
|
|
processedPos++; |
|
23607
|
23097
|
|
|
|
|
|
continue; |
|
23608
|
|
|
|
|
|
|
} |
|
23609
|
|
|
|
|
|
|
else |
|
23610
|
|
|
|
|
|
|
{ |
|
23611
|
84345
|
|
|
|
|
|
UPDATE_1(prob); |
|
23612
|
84345
|
|
|
|
|
|
prob = probs + IsRep + state; |
|
23613
|
84345
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
|
100
|
|
|
|
|
|
|
23614
|
|
|
|
|
|
|
{ |
|
23615
|
487
|
|
|
|
|
|
UPDATE_0(prob); |
|
23616
|
487
|
|
|
|
|
|
state += kNumStates; |
|
23617
|
487
|
|
|
|
|
|
prob = probs + LenCoder; |
|
23618
|
|
|
|
|
|
|
} |
|
23619
|
|
|
|
|
|
|
else |
|
23620
|
|
|
|
|
|
|
{ |
|
23621
|
83858
|
|
|
|
|
|
UPDATE_1(prob); |
|
23622
|
83858
|
50
|
|
|
|
|
if (checkDicSize == 0 && processedPos == 0) |
|
23623
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
|
23624
|
83858
|
|
|
|
|
|
prob = probs + IsRepG0 + state; |
|
23625
|
83858
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
|
100
|
|
|
|
|
|
|
23626
|
|
|
|
|
|
|
{ |
|
23627
|
83695
|
|
|
|
|
|
UPDATE_0(prob); |
|
23628
|
83695
|
|
|
|
|
|
prob = probs + IsRep0Long + (state << kNumPosBitsMax) + posState; |
|
23629
|
83695
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
|
100
|
|
|
|
|
|
|
23630
|
|
|
|
|
|
|
{ |
|
23631
|
645
|
|
|
|
|
|
UPDATE_0(prob); |
|
23632
|
645
|
50
|
|
|
|
|
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
|
23633
|
645
|
|
|
|
|
|
dicPos++; |
|
23634
|
645
|
|
|
|
|
|
processedPos++; |
|
23635
|
645
|
100
|
|
|
|
|
state = state < kNumLitStates ? 9 : 11; |
|
23636
|
|
|
|
|
|
|
continue; |
|
23637
|
|
|
|
|
|
|
} |
|
23638
|
83050
|
|
|
|
|
|
UPDATE_1(prob); |
|
23639
|
|
|
|
|
|
|
} |
|
23640
|
|
|
|
|
|
|
else |
|
23641
|
|
|
|
|
|
|
{ |
|
23642
|
|
|
|
|
|
|
uint32_t distance; |
|
23643
|
163
|
|
|
|
|
|
UPDATE_1(prob); |
|
23644
|
163
|
|
|
|
|
|
prob = probs + IsRepG1 + state; |
|
23645
|
163
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
|
100
|
|
|
|
|
|
|
23646
|
|
|
|
|
|
|
{ |
|
23647
|
97
|
|
|
|
|
|
UPDATE_0(prob); |
|
23648
|
|
|
|
|
|
|
distance = rep1; |
|
23649
|
|
|
|
|
|
|
} |
|
23650
|
|
|
|
|
|
|
else |
|
23651
|
|
|
|
|
|
|
{ |
|
23652
|
66
|
|
|
|
|
|
UPDATE_1(prob); |
|
23653
|
66
|
|
|
|
|
|
prob = probs + IsRepG2 + state; |
|
23654
|
66
|
100
|
|
|
|
|
IF_BIT_0(prob) |
|
|
|
100
|
|
|
|
|
|
|
23655
|
|
|
|
|
|
|
{ |
|
23656
|
37
|
|
|
|
|
|
UPDATE_0(prob); |
|
23657
|
|
|
|
|
|
|
distance = rep2; |
|
23658
|
|
|
|
|
|
|
} |
|
23659
|
|
|
|
|
|
|
else |
|
23660
|
|
|
|
|
|
|
{ |
|
23661
|
29
|
|
|
|
|
|
UPDATE_1(prob); |
|
23662
|
|
|
|
|
|
|
distance = rep3; |
|
23663
|
|
|
|
|
|
|
rep3 = rep2; |
|
23664
|
|
|
|
|
|
|
} |
|
23665
|
|
|
|
|
|
|
rep2 = rep1; |
|
23666
|
|
|
|
|
|
|
} |
|
23667
|
|
|
|
|
|
|
rep1 = rep0; |
|
23668
|
|
|
|
|
|
|
rep0 = distance; |
|
23669
|
|
|
|
|
|
|
} |
|
23670
|
83213
|
100
|
|
|
|
|
state = state < kNumLitStates ? 8 : 11; |
|
23671
|
83213
|
|
|
|
|
|
prob = probs + RepLenCoder; |
|
23672
|
|
|
|
|
|
|
} |
|
23673
|
|
|
|
|
|
|
{ |
|
23674
|
|
|
|
|
|
|
unsigned limit, offset; |
|
23675
|
|
|
|
|
|
|
CLzmaProb *probLen = prob + LenChoice; |
|
23676
|
83700
|
100
|
|
|
|
|
IF_BIT_0(probLen) |
|
|
|
100
|
|
|
|
|
|
|
23677
|
|
|
|
|
|
|
{ |
|
23678
|
445
|
|
|
|
|
|
UPDATE_0(probLen); |
|
23679
|
445
|
|
|
|
|
|
probLen = prob + LenLow + (posState << kLenNumLowBits); |
|
23680
|
|
|
|
|
|
|
offset = 0; |
|
23681
|
|
|
|
|
|
|
limit = (1 << kLenNumLowBits); |
|
23682
|
|
|
|
|
|
|
} |
|
23683
|
|
|
|
|
|
|
else |
|
23684
|
|
|
|
|
|
|
{ |
|
23685
|
83255
|
|
|
|
|
|
UPDATE_1(probLen); |
|
23686
|
|
|
|
|
|
|
probLen = prob + LenChoice2; |
|
23687
|
83255
|
100
|
|
|
|
|
IF_BIT_0(probLen) |
|
|
|
100
|
|
|
|
|
|
|
23688
|
|
|
|
|
|
|
{ |
|
23689
|
113
|
|
|
|
|
|
UPDATE_0(probLen); |
|
23690
|
113
|
|
|
|
|
|
probLen = prob + LenMid + (posState << kLenNumMidBits); |
|
23691
|
|
|
|
|
|
|
offset = kLenNumLowSymbols; |
|
23692
|
|
|
|
|
|
|
limit = (1 << kLenNumMidBits); |
|
23693
|
|
|
|
|
|
|
} |
|
23694
|
|
|
|
|
|
|
else |
|
23695
|
|
|
|
|
|
|
{ |
|
23696
|
83142
|
|
|
|
|
|
UPDATE_1(probLen); |
|
23697
|
83700
|
|
|
|
|
|
probLen = prob + LenHigh; |
|
23698
|
|
|
|
|
|
|
offset = kLenNumLowSymbols + kLenNumMidSymbols; |
|
23699
|
|
|
|
|
|
|
limit = (1 << kLenNumHighBits); |
|
23700
|
|
|
|
|
|
|
} |
|
23701
|
|
|
|
|
|
|
} |
|
23702
|
666810
|
100
|
|
|
|
|
TREE_DECODE(probLen, limit, len); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23703
|
83700
|
|
|
|
|
|
len += offset; |
|
23704
|
|
|
|
|
|
|
} |
|
23705
|
|
|
|
|
|
|
|
|
23706
|
83700
|
100
|
|
|
|
|
if (state >= kNumStates) |
|
23707
|
|
|
|
|
|
|
{ |
|
23708
|
|
|
|
|
|
|
uint32_t distance; |
|
23709
|
487
|
|
|
|
|
|
prob = probs + PosSlot + |
|
23710
|
974
|
|
|
|
|
|
((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits); |
|
23711
|
487
|
100
|
|
|
|
|
TREE_6_DECODE(prob, distance); |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23712
|
487
|
100
|
|
|
|
|
if (distance >= kStartPosModelIndex) |
|
23713
|
|
|
|
|
|
|
{ |
|
23714
|
|
|
|
|
|
|
unsigned posSlot = (unsigned)distance; |
|
23715
|
405
|
|
|
|
|
|
int numDirectBits = (int)(((distance >> 1) - 1)); |
|
23716
|
405
|
|
|
|
|
|
distance = (2 | (distance & 1)); |
|
23717
|
405
|
100
|
|
|
|
|
if (posSlot < kEndPosModelIndex) |
|
23718
|
|
|
|
|
|
|
{ |
|
23719
|
166
|
|
|
|
|
|
distance <<= numDirectBits; |
|
23720
|
166
|
|
|
|
|
|
prob = probs + SpecPos + distance - posSlot - 1; |
|
23721
|
|
|
|
|
|
|
{ |
|
23722
|
|
|
|
|
|
|
uint32_t mask = 1; |
|
23723
|
|
|
|
|
|
|
unsigned i = 1; |
|
23724
|
491
|
100
|
|
|
|
|
do |
|
23725
|
|
|
|
|
|
|
{ |
|
23726
|
491
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= mask); |
|
|
|
100
|
|
|
|
|
|
|
23727
|
491
|
|
|
|
|
|
mask <<= 1; |
|
23728
|
|
|
|
|
|
|
} |
|
23729
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
|
23730
|
|
|
|
|
|
|
} |
|
23731
|
|
|
|
|
|
|
} |
|
23732
|
|
|
|
|
|
|
else |
|
23733
|
|
|
|
|
|
|
{ |
|
23734
|
239
|
|
|
|
|
|
numDirectBits -= kNumAlignBits; |
|
23735
|
2572
|
100
|
|
|
|
|
do |
|
23736
|
|
|
|
|
|
|
{ |
|
23737
|
2572
|
100
|
|
|
|
|
NORMALIZE |
|
23738
|
2572
|
|
|
|
|
|
range >>= 1; |
|
23739
|
|
|
|
|
|
|
|
|
23740
|
|
|
|
|
|
|
{ |
|
23741
|
|
|
|
|
|
|
uint32_t t; |
|
23742
|
2572
|
|
|
|
|
|
code -= range; |
|
23743
|
2572
|
|
|
|
|
|
t = (0 - ((uint32_t)code >> 31)); /* (uint32_t)((int32_t)code >> 31) */ |
|
23744
|
2572
|
|
|
|
|
|
distance = (distance << 1) + (t + 1); |
|
23745
|
2572
|
|
|
|
|
|
code += range & t; |
|
23746
|
|
|
|
|
|
|
} |
|
23747
|
|
|
|
|
|
|
/* |
|
23748
|
|
|
|
|
|
|
distance <<= 1; |
|
23749
|
|
|
|
|
|
|
if (code >= range) |
|
23750
|
|
|
|
|
|
|
{ |
|
23751
|
|
|
|
|
|
|
code -= range; |
|
23752
|
|
|
|
|
|
|
distance |= 1; |
|
23753
|
|
|
|
|
|
|
} |
|
23754
|
|
|
|
|
|
|
*/ |
|
23755
|
|
|
|
|
|
|
} |
|
23756
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
|
23757
|
239
|
|
|
|
|
|
prob = probs + Align; |
|
23758
|
239
|
|
|
|
|
|
distance <<= kNumAlignBits; |
|
23759
|
|
|
|
|
|
|
{ |
|
23760
|
|
|
|
|
|
|
unsigned i = 1; |
|
23761
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 1); |
|
|
|
100
|
|
|
|
|
|
|
23762
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 2); |
|
|
|
100
|
|
|
|
|
|
|
23763
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 4); |
|
|
|
100
|
|
|
|
|
|
|
23764
|
239
|
100
|
|
|
|
|
GET_BIT2(prob + i, i, ; , distance |= 8); |
|
|
|
100
|
|
|
|
|
|
|
23765
|
|
|
|
|
|
|
} |
|
23766
|
239
|
50
|
|
|
|
|
if (distance == (uint32_t)0xFFFFFFFF) |
|
23767
|
|
|
|
|
|
|
{ |
|
23768
|
0
|
|
|
|
|
|
len += kMatchSpecLenStart; |
|
23769
|
0
|
|
|
|
|
|
state -= kNumStates; |
|
23770
|
0
|
|
|
|
|
|
break; |
|
23771
|
|
|
|
|
|
|
} |
|
23772
|
|
|
|
|
|
|
} |
|
23773
|
|
|
|
|
|
|
} |
|
23774
|
|
|
|
|
|
|
rep3 = rep2; |
|
23775
|
|
|
|
|
|
|
rep2 = rep1; |
|
23776
|
|
|
|
|
|
|
rep1 = rep0; |
|
23777
|
487
|
|
|
|
|
|
rep0 = distance + 1; |
|
23778
|
487
|
50
|
|
|
|
|
if (checkDicSize == 0) |
|
23779
|
|
|
|
|
|
|
{ |
|
23780
|
487
|
50
|
|
|
|
|
if (distance >= processedPos) |
|
23781
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
|
23782
|
|
|
|
|
|
|
} |
|
23783
|
0
|
0
|
|
|
|
|
else if (distance >= checkDicSize) |
|
23784
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
|
23785
|
487
|
100
|
|
|
|
|
state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; |
|
23786
|
|
|
|
|
|
|
} |
|
23787
|
|
|
|
|
|
|
|
|
23788
|
83700
|
|
|
|
|
|
len += kMatchMinLen; |
|
23789
|
|
|
|
|
|
|
|
|
23790
|
83700
|
50
|
|
|
|
|
if (limit == dicPos) |
|
23791
|
|
|
|
|
|
|
return SZ_ERROR_DATA; |
|
23792
|
|
|
|
|
|
|
{ |
|
23793
|
83700
|
|
|
|
|
|
size_t rem = limit - dicPos; |
|
23794
|
83700
|
50
|
|
|
|
|
unsigned curLen = ((rem < len) ? (unsigned)rem : len); |
|
23795
|
83700
|
50
|
|
|
|
|
size_t pos = (dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0); |
|
23796
|
|
|
|
|
|
|
|
|
23797
|
83700
|
|
|
|
|
|
processedPos += curLen; |
|
23798
|
|
|
|
|
|
|
|
|
23799
|
83700
|
|
|
|
|
|
len -= curLen; |
|
23800
|
83700
|
50
|
|
|
|
|
if (pos + curLen <= dicBufSize) |
|
23801
|
|
|
|
|
|
|
{ |
|
23802
|
83700
|
|
|
|
|
|
uint8_t *dest = dic + dicPos; |
|
23803
|
83700
|
|
|
|
|
|
ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos; |
|
23804
|
83700
|
|
|
|
|
|
const uint8_t *lim = dest + curLen; |
|
23805
|
83700
|
|
|
|
|
|
dicPos += curLen; |
|
23806
|
22650228
|
100
|
|
|
|
|
do |
|
23807
|
22650228
|
|
|
|
|
|
*(dest) = (uint8_t)*(dest + src); |
|
23808
|
|
|
|
|
|
|
while (++dest != lim); |
|
23809
|
|
|
|
|
|
|
} |
|
23810
|
|
|
|
|
|
|
else |
|
23811
|
|
|
|
|
|
|
{ |
|
23812
|
0
|
0
|
|
|
|
|
do |
|
23813
|
|
|
|
|
|
|
{ |
|
23814
|
0
|
|
|
|
|
|
dic[dicPos++] = dic[pos]; |
|
23815
|
0
|
0
|
|
|
|
|
if (++pos == dicBufSize) |
|
23816
|
|
|
|
|
|
|
pos = 0; |
|
23817
|
|
|
|
|
|
|
} |
|
23818
|
|
|
|
|
|
|
while (--curLen != 0); |
|
23819
|
|
|
|
|
|
|
} |
|
23820
|
|
|
|
|
|
|
} |
|
23821
|
|
|
|
|
|
|
} |
|
23822
|
|
|
|
|
|
|
} |
|
23823
|
107442
|
100
|
|
|
|
|
while (dicPos < limit && buf < bufLimit); |
|
23824
|
504
|
100
|
|
|
|
|
NORMALIZE; |
|
23825
|
504
|
|
|
|
|
|
p->buf = buf; |
|
23826
|
504
|
|
|
|
|
|
p->range = range; |
|
23827
|
504
|
|
|
|
|
|
p->code = code; |
|
23828
|
504
|
|
|
|
|
|
p->remainLen = len; |
|
23829
|
504
|
|
|
|
|
|
p->dicPos = dicPos; |
|
23830
|
504
|
|
|
|
|
|
p->processedPos = processedPos; |
|
23831
|
504
|
|
|
|
|
|
p->reps[0] = rep0; |
|
23832
|
504
|
|
|
|
|
|
p->reps[1] = rep1; |
|
23833
|
504
|
|
|
|
|
|
p->reps[2] = rep2; |
|
23834
|
504
|
|
|
|
|
|
p->reps[3] = rep3; |
|
23835
|
504
|
|
|
|
|
|
p->state = state; |
|
23836
|
|
|
|
|
|
|
|
|
23837
|
504
|
|
|
|
|
|
return SZ_OK; |
|
23838
|
|
|
|
|
|
|
} |
|
23839
|
|
|
|
|
|
|
|
|
23840
|
510
|
|
|
|
|
|
static void LzmaDec_WriteRem(CLzmaDec *p, size_t limit) |
|
23841
|
|
|
|
|
|
|
{ |
|
23842
|
510
|
50
|
|
|
|
|
if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) |
|
23843
|
|
|
|
|
|
|
{ |
|
23844
|
0
|
|
|
|
|
|
uint8_t *dic = p->dic; |
|
23845
|
0
|
|
|
|
|
|
size_t dicPos = p->dicPos; |
|
23846
|
0
|
|
|
|
|
|
size_t dicBufSize = p->dicBufSize; |
|
23847
|
|
|
|
|
|
|
unsigned len = p->remainLen; |
|
23848
|
0
|
|
|
|
|
|
uint32_t rep0 = p->reps[0]; |
|
23849
|
0
|
0
|
|
|
|
|
if (limit - dicPos < len) |
|
23850
|
0
|
|
|
|
|
|
len = (unsigned)(limit - dicPos); |
|
23851
|
|
|
|
|
|
|
|
|
23852
|
0
|
0
|
|
|
|
|
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
|
|
|
0
|
|
|
|
|
|
|
23853
|
0
|
|
|
|
|
|
p->checkDicSize = p->prop.dicSize; |
|
23854
|
|
|
|
|
|
|
|
|
23855
|
0
|
|
|
|
|
|
p->processedPos += len; |
|
23856
|
0
|
|
|
|
|
|
p->remainLen -= len; |
|
23857
|
0
|
0
|
|
|
|
|
while (len-- != 0) |
|
23858
|
|
|
|
|
|
|
{ |
|
23859
|
0
|
0
|
|
|
|
|
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
|
23860
|
0
|
|
|
|
|
|
dicPos++; |
|
23861
|
|
|
|
|
|
|
} |
|
23862
|
0
|
|
|
|
|
|
p->dicPos = dicPos; |
|
23863
|
|
|
|
|
|
|
} |
|
23864
|
510
|
|
|
|
|
|
} |
|
23865
|
|
|
|
|
|
|
|
|
23866
|
1008
|
|
|
|
|
|
static int LzmaDec_DecodeReal2(CLzmaDec *p, size_t limit, const uint8_t *bufLimit) |
|
23867
|
|
|
|
|
|
|
{ |
|
23868
|
|
|
|
|
|
|
do |
|
23869
|
|
|
|
|
|
|
{ |
|
23870
|
|
|
|
|
|
|
size_t limit2 = limit; |
|
23871
|
504
|
50
|
|
|
|
|
if (p->checkDicSize == 0) |
|
23872
|
|
|
|
|
|
|
{ |
|
23873
|
504
|
|
|
|
|
|
uint32_t rem = p->prop.dicSize - p->processedPos; |
|
23874
|
504
|
50
|
|
|
|
|
if (limit - p->dicPos > rem) |
|
23875
|
0
|
|
|
|
|
|
limit2 = p->dicPos + rem; |
|
23876
|
|
|
|
|
|
|
} |
|
23877
|
504
|
50
|
|
|
|
|
RINOK(LzmaDec_DecodeReal(p, limit2, bufLimit)); |
|
23878
|
504
|
50
|
|
|
|
|
if (p->processedPos >= p->prop.dicSize) |
|
23879
|
0
|
|
|
|
|
|
p->checkDicSize = p->prop.dicSize; |
|
23880
|
504
|
|
|
|
|
|
LzmaDec_WriteRem(p, limit); |
|
23881
|
|
|
|
|
|
|
} |
|
23882
|
504
|
100
|
|
|
|
|
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
|
|
50
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
23883
|
|
|
|
|
|
|
|
|
23884
|
504
|
50
|
|
|
|
|
if (p->remainLen > kMatchSpecLenStart) |
|
23885
|
|
|
|
|
|
|
{ |
|
23886
|
0
|
|
|
|
|
|
p->remainLen = kMatchSpecLenStart; |
|
23887
|
|
|
|
|
|
|
} |
|
23888
|
|
|
|
|
|
|
return 0; |
|
23889
|
|
|
|
|
|
|
} |
|
23890
|
|
|
|
|
|
|
|
|
23891
|
|
|
|
|
|
|
enum ELzmaDummy |
|
23892
|
|
|
|
|
|
|
{ |
|
23893
|
|
|
|
|
|
|
DUMMY_ERROR, /* unexpected end of input stream */ |
|
23894
|
|
|
|
|
|
|
DUMMY_LIT, |
|
23895
|
|
|
|
|
|
|
DUMMY_MATCH, |
|
23896
|
|
|
|
|
|
|
DUMMY_REP |
|
23897
|
|
|
|
|
|
|
}; |
|
23898
|
|
|
|
|
|
|
|
|
23899
|
470
|
|
|
|
|
|
static ELzmaDummy LzmaDec_TryDummy(const CLzmaDec *p, const uint8_t *buf, size_t inSize) |
|
23900
|
|
|
|
|
|
|
{ |
|
23901
|
470
|
|
|
|
|
|
uint32_t range = p->range; |
|
23902
|
470
|
|
|
|
|
|
uint32_t code = p->code; |
|
23903
|
470
|
|
|
|
|
|
const uint8_t *bufLimit = buf + inSize; |
|
23904
|
470
|
|
|
|
|
|
CLzmaProb *probs = p->probs; |
|
23905
|
470
|
|
|
|
|
|
unsigned state = p->state; |
|
23906
|
|
|
|
|
|
|
ELzmaDummy res; |
|
23907
|
|
|
|
|
|
|
|
|
23908
|
|
|
|
|
|
|
{ |
|
23909
|
|
|
|
|
|
|
CLzmaProb *prob; |
|
23910
|
|
|
|
|
|
|
uint32_t bound; |
|
23911
|
|
|
|
|
|
|
unsigned ttt; |
|
23912
|
470
|
|
|
|
|
|
unsigned posState = (p->processedPos) & ((1 << p->prop.pb) - 1); |
|
23913
|
|
|
|
|
|
|
|
|
23914
|
470
|
|
|
|
|
|
prob = probs + IsMatch + (state << kNumPosBitsMax) + posState; |
|
23915
|
470
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23916
|
|
|
|
|
|
|
{ |
|
23917
|
|
|
|
|
|
|
UPDATE_0_CHECK |
|
23918
|
|
|
|
|
|
|
|
|
23919
|
|
|
|
|
|
|
/* if (bufLimit - buf >= 7) return DUMMY_LIT; */ |
|
23920
|
|
|
|
|
|
|
|
|
23921
|
37
|
|
|
|
|
|
prob = probs + Literal; |
|
23922
|
37
|
100
|
|
|
|
|
if (p->checkDicSize != 0 || p->processedPos != 0) |
|
23923
|
36
|
|
|
|
|
|
prob += (LZMA_LIT_SIZE * |
|
23924
|
72
|
|
|
|
|
|
((((p->processedPos) & ((1 << (p->prop.lp)) - 1)) << p->prop.lc) + |
|
23925
|
36
|
50
|
|
|
|
|
(p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); |
|
23926
|
|
|
|
|
|
|
|
|
23927
|
37
|
100
|
|
|
|
|
if (state < kNumLitStates) |
|
23928
|
|
|
|
|
|
|
{ |
|
23929
|
|
|
|
|
|
|
unsigned symbol = 1; |
|
23930
|
216
|
100
|
|
|
|
|
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23931
|
|
|
|
|
|
|
} |
|
23932
|
|
|
|
|
|
|
else |
|
23933
|
|
|
|
|
|
|
{ |
|
23934
|
20
|
|
|
|
|
|
unsigned matchByte = p->dic[p->dicPos - p->reps[0] + |
|
23935
|
10
|
50
|
|
|
|
|
((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)]; |
|
23936
|
|
|
|
|
|
|
unsigned offs = 0x100; |
|
23937
|
|
|
|
|
|
|
unsigned symbol = 1; |
|
23938
|
|
|
|
|
|
|
do |
|
23939
|
|
|
|
|
|
|
{ |
|
23940
|
|
|
|
|
|
|
unsigned bit; |
|
23941
|
|
|
|
|
|
|
CLzmaProb *probLit; |
|
23942
|
80
|
|
|
|
|
|
matchByte <<= 1; |
|
23943
|
80
|
|
|
|
|
|
bit = (matchByte & offs); |
|
23944
|
80
|
|
|
|
|
|
probLit = prob + offs + bit + symbol; |
|
23945
|
80
|
100
|
|
|
|
|
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23946
|
|
|
|
|
|
|
} |
|
23947
|
80
|
100
|
|
|
|
|
while (symbol < 0x100); |
|
23948
|
|
|
|
|
|
|
} |
|
23949
|
|
|
|
|
|
|
res = DUMMY_LIT; |
|
23950
|
|
|
|
|
|
|
} |
|
23951
|
|
|
|
|
|
|
else |
|
23952
|
|
|
|
|
|
|
{ |
|
23953
|
|
|
|
|
|
|
unsigned len; |
|
23954
|
433
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
23955
|
|
|
|
|
|
|
|
|
23956
|
433
|
|
|
|
|
|
prob = probs + IsRep + state; |
|
23957
|
433
|
100
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23958
|
|
|
|
|
|
|
{ |
|
23959
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
|
23960
|
|
|
|
|
|
|
state = 0; |
|
23961
|
17
|
|
|
|
|
|
prob = probs + LenCoder; |
|
23962
|
|
|
|
|
|
|
res = DUMMY_MATCH; |
|
23963
|
|
|
|
|
|
|
} |
|
23964
|
|
|
|
|
|
|
else |
|
23965
|
|
|
|
|
|
|
{ |
|
23966
|
416
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
23967
|
|
|
|
|
|
|
res = DUMMY_REP; |
|
23968
|
416
|
|
|
|
|
|
prob = probs + IsRepG0 + state; |
|
23969
|
416
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
|
0
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
23970
|
|
|
|
|
|
|
{ |
|
23971
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
|
23972
|
415
|
|
|
|
|
|
prob = probs + IsRep0Long + (state << kNumPosBitsMax) + posState; |
|
23973
|
415
|
100
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
23974
|
|
|
|
|
|
|
{ |
|
23975
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
|
23976
|
0
|
0
|
|
|
|
|
NORMALIZE_CHECK; |
|
|
|
0
|
|
|
|
|
|
|
23977
|
|
|
|
|
|
|
return DUMMY_REP; |
|
23978
|
|
|
|
|
|
|
} |
|
23979
|
|
|
|
|
|
|
else |
|
23980
|
|
|
|
|
|
|
{ |
|
23981
|
415
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
23982
|
|
|
|
|
|
|
} |
|
23983
|
|
|
|
|
|
|
} |
|
23984
|
|
|
|
|
|
|
else |
|
23985
|
|
|
|
|
|
|
{ |
|
23986
|
1
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
23987
|
1
|
|
|
|
|
|
prob = probs + IsRepG1 + state; |
|
23988
|
1
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
23989
|
|
|
|
|
|
|
{ |
|
23990
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
|
23991
|
|
|
|
|
|
|
} |
|
23992
|
|
|
|
|
|
|
else |
|
23993
|
|
|
|
|
|
|
{ |
|
23994
|
1
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
23995
|
1
|
|
|
|
|
|
prob = probs + IsRepG2 + state; |
|
23996
|
1
|
50
|
|
|
|
|
IF_BIT_0_CHECK(prob) |
|
|
|
0
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
23997
|
|
|
|
|
|
|
{ |
|
23998
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
|
23999
|
|
|
|
|
|
|
} |
|
24000
|
|
|
|
|
|
|
else |
|
24001
|
|
|
|
|
|
|
{ |
|
24002
|
0
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
24003
|
|
|
|
|
|
|
} |
|
24004
|
|
|
|
|
|
|
} |
|
24005
|
|
|
|
|
|
|
} |
|
24006
|
|
|
|
|
|
|
state = kNumStates; |
|
24007
|
416
|
|
|
|
|
|
prob = probs + RepLenCoder; |
|
24008
|
|
|
|
|
|
|
} |
|
24009
|
|
|
|
|
|
|
{ |
|
24010
|
|
|
|
|
|
|
unsigned limit, offset; |
|
24011
|
|
|
|
|
|
|
CLzmaProb *probLen = prob + LenChoice; |
|
24012
|
433
|
100
|
|
|
|
|
IF_BIT_0_CHECK(probLen) |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
24013
|
|
|
|
|
|
|
{ |
|
24014
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
|
24015
|
16
|
|
|
|
|
|
probLen = prob + LenLow + (posState << kLenNumLowBits); |
|
24016
|
|
|
|
|
|
|
offset = 0; |
|
24017
|
|
|
|
|
|
|
limit = 1 << kLenNumLowBits; |
|
24018
|
|
|
|
|
|
|
} |
|
24019
|
|
|
|
|
|
|
else |
|
24020
|
|
|
|
|
|
|
{ |
|
24021
|
417
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
24022
|
|
|
|
|
|
|
probLen = prob + LenChoice2; |
|
24023
|
417
|
100
|
|
|
|
|
IF_BIT_0_CHECK(probLen) |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
24024
|
|
|
|
|
|
|
{ |
|
24025
|
|
|
|
|
|
|
UPDATE_0_CHECK; |
|
24026
|
2
|
|
|
|
|
|
probLen = prob + LenMid + (posState << kLenNumMidBits); |
|
24027
|
|
|
|
|
|
|
offset = kLenNumLowSymbols; |
|
24028
|
|
|
|
|
|
|
limit = 1 << kLenNumMidBits; |
|
24029
|
|
|
|
|
|
|
} |
|
24030
|
|
|
|
|
|
|
else |
|
24031
|
|
|
|
|
|
|
{ |
|
24032
|
415
|
|
|
|
|
|
UPDATE_1_CHECK; |
|
24033
|
433
|
|
|
|
|
|
probLen = prob + LenHigh; |
|
24034
|
|
|
|
|
|
|
offset = kLenNumLowSymbols + kLenNumMidSymbols; |
|
24035
|
|
|
|
|
|
|
limit = 1 << kLenNumHighBits; |
|
24036
|
|
|
|
|
|
|
} |
|
24037
|
|
|
|
|
|
|
} |
|
24038
|
3374
|
100
|
|
|
|
|
TREE_DECODE_CHECK(probLen, limit, len); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
24039
|
433
|
|
|
|
|
|
len += offset; |
|
24040
|
|
|
|
|
|
|
} |
|
24041
|
|
|
|
|
|
|
|
|
24042
|
433
|
100
|
|
|
|
|
if (state < 4) |
|
24043
|
|
|
|
|
|
|
{ |
|
24044
|
|
|
|
|
|
|
unsigned posSlot; |
|
24045
|
17
|
|
|
|
|
|
prob = probs + PosSlot + |
|
24046
|
17
|
|
|
|
|
|
((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << |
|
24047
|
17
|
|
|
|
|
|
kNumPosSlotBits); |
|
24048
|
102
|
100
|
|
|
|
|
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
24049
|
17
|
100
|
|
|
|
|
if (posSlot >= kStartPosModelIndex) |
|
24050
|
|
|
|
|
|
|
{ |
|
24051
|
14
|
|
|
|
|
|
int numDirectBits = ((posSlot >> 1) - 1); |
|
24052
|
|
|
|
|
|
|
|
|
24053
|
|
|
|
|
|
|
/* if (bufLimit - buf >= 8) return DUMMY_MATCH; */ |
|
24054
|
|
|
|
|
|
|
|
|
24055
|
14
|
100
|
|
|
|
|
if (posSlot < kEndPosModelIndex) |
|
24056
|
|
|
|
|
|
|
{ |
|
24057
|
9
|
|
|
|
|
|
prob = probs + SpecPos + ((2 | (posSlot & 1)) << numDirectBits) - posSlot - 1; |
|
24058
|
|
|
|
|
|
|
} |
|
24059
|
|
|
|
|
|
|
else |
|
24060
|
|
|
|
|
|
|
{ |
|
24061
|
5
|
|
|
|
|
|
numDirectBits -= kNumAlignBits; |
|
24062
|
42
|
100
|
|
|
|
|
do |
|
24063
|
|
|
|
|
|
|
{ |
|
24064
|
42
|
100
|
|
|
|
|
NORMALIZE_CHECK |
|
|
|
50
|
|
|
|
|
|
|
24065
|
42
|
|
|
|
|
|
range >>= 1; |
|
24066
|
42
|
|
|
|
|
|
code -= range & (((code - range) >> 31) - 1); |
|
24067
|
|
|
|
|
|
|
/* if (code >= range) code -= range; */ |
|
24068
|
|
|
|
|
|
|
} |
|
24069
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
|
24070
|
14
|
|
|
|
|
|
prob = probs + Align; |
|
24071
|
|
|
|
|
|
|
numDirectBits = kNumAlignBits; |
|
24072
|
|
|
|
|
|
|
} |
|
24073
|
|
|
|
|
|
|
{ |
|
24074
|
|
|
|
|
|
|
unsigned i = 1; |
|
24075
|
47
|
100
|
|
|
|
|
do |
|
24076
|
|
|
|
|
|
|
{ |
|
24077
|
47
|
100
|
|
|
|
|
GET_BIT_CHECK(prob + i, i); |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
24078
|
|
|
|
|
|
|
} |
|
24079
|
|
|
|
|
|
|
while (--numDirectBits != 0); |
|
24080
|
|
|
|
|
|
|
} |
|
24081
|
|
|
|
|
|
|
} |
|
24082
|
|
|
|
|
|
|
} |
|
24083
|
|
|
|
|
|
|
} |
|
24084
|
|
|
|
|
|
|
} |
|
24085
|
470
|
100
|
|
|
|
|
NORMALIZE_CHECK; |
|
|
|
50
|
|
|
|
|
|
|
24086
|
|
|
|
|
|
|
return res; |
|
24087
|
|
|
|
|
|
|
} |
|
24088
|
|
|
|
|
|
|
|
|
24089
|
|
|
|
|
|
|
/* Starts the range decoder from a range-coder header.
   data[1..4] are combined big-endian into p->code; data[0] is not read here.
   Also resets the range to its full 32-bit width and clears needFlush. */
static void LzmaDec_InitRc(CLzmaDec *p, const uint8_t *data)
{
  p->code = ((uint32_t)data[1] << 24) | ((uint32_t)data[2] << 16) | ((uint32_t)data[3] << 8) | ((uint32_t)data[4]);
  p->range = 0xFFFFFFFF;
  p->needFlush = 0;
}
|
24095
|
|
|
|
|
|
|
|
|
24096
|
0
|
|
|
|
|
|
/* Resets the streaming part of the decoder.
   Always: sets needFlush, clears the pending remainLen and the buffered input
   (tempBufSize).
   initDic:   additionally restarts processedPos / checkDicSize.
   Either flag sets needInitState. */
void LzmaDec_InitDicAndState(CLzmaDec *p, bool initDic, bool initState)
{
  p->needFlush = 1;
  p->remainLen = 0;
  p->tempBufSize = 0;

  if (initDic)
  {
    p->processedPos = 0;
    p->checkDicSize = 0;
    p->needInitState = 1;
  }
  if (initState)
    p->needInitState = 1;
}
|
24111
|
|
|
|
|
|
|
|
|
24112
|
0
|
|
|
|
|
|
/* Prepares the decoder for a new stream: rewinds the dictionary write
   position and requests a full dictionary + state reset. */
void LzmaDec_Init(CLzmaDec *p)
{
  p->dicPos = 0;
  LzmaDec_InitDicAndState(p, true, true);
}
|
24117
|
|
|
|
|
|
|
|
|
24118
|
|
|
|
|
|
|
/* Performs the deferred state reset requested through needInitState:
   every probability in use (Literal + LZMA_LIT_SIZE << (lc + lp) entries) is
   set to kBitModelTotal / 2, all four rep distances are set to 1, the state
   is set to 0 and the needInitState flag is cleared. */
static void LzmaDec_InitStateReal(CLzmaDec *p)
{
  uint32_t numProbs = Literal + ((uint32_t)LZMA_LIT_SIZE << (p->prop.lc + p->prop.lp));
  uint32_t i;
  CLzmaProb *probs = p->probs;
  for (i = 0; i < numProbs; i++)
    probs[i] = kBitModelTotal >> 1;
  p->reps[0] = p->reps[1] = p->reps[2] = p->reps[3] = 1;
  p->state = 0;
  p->needInitState = 0;
}
|
24129
|
|
|
|
|
|
|
|
|
24130
|
6
|
|
|
|
|
|
/* Decodes input into the decoder's dictionary buffer until p->dicPos reaches
   dicLimit, the input runs out, or the end-of-stream marker is decoded.

   p          - decoder state; p->dic / p->dicBufSize must already be set up.
   dicLimit   - dictionary position at which decoding stops.
   src        - input bytes.
   srcLen     - in: number of available input bytes; out: number consumed
                (including bytes parked in p->tempBuf for the next call).
   finishMode - with LZMA_FINISH_ANY, reaching dicLimit mid-stream is reported
                as LZMA_STATUS_NOT_FINISHED; otherwise the stream has to end
                exactly there (end marker) or SZ_ERROR_DATA is returned.
   status     - out: detailed outcome (LZMA_STATUS_*).

   Returns SZ_OK or SZ_ERROR_DATA. */
SRes LzmaDec_DecodeToDic(CLzmaDec *p, size_t dicLimit, const uint8_t *src, size_t *srcLen,
    ELzmaFinishMode finishMode, ELzmaStatus *status)
{
  size_t inSize = *srcLen;
  (*srcLen) = 0;
  LzmaDec_WriteRem(p, dicLimit);

  *status = LZMA_STATUS_NOT_SPECIFIED;

  while (p->remainLen != kMatchSpecLenStart)
  {
      int checkEndMarkNow;

      if (p->needFlush != 0)
      {
        /* Collect the RC_INIT_SIZE-byte range-coder header, possibly across calls. */
        for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--)
          p->tempBuf[p->tempBufSize++] = *src++;
        if (p->tempBufSize < RC_INIT_SIZE)
        {
          *status = LZMA_STATUS_NEEDS_MORE_INPUT;
          return SZ_OK;
        }
        /* The first header byte must be zero. */
        if (p->tempBuf[0] != 0)
          return SZ_ERROR_DATA;

        LzmaDec_InitRc(p, p->tempBuf);
        p->tempBufSize = 0;
      }

      checkEndMarkNow = 0;
      if (p->dicPos >= dicLimit)
      {
        if (p->remainLen == 0 && p->code == 0)
        {
          *status = LZMA_STATUS_MAYBE_FINISHED_WITHOUT_MARK;
          return SZ_OK;
        }
        if (finishMode == LZMA_FINISH_ANY)
        {
          *status = LZMA_STATUS_NOT_FINISHED;
          return SZ_OK;
        }
        if (p->remainLen != 0)
        {
          *status = LZMA_STATUS_NOT_FINISHED;
          return SZ_ERROR_DATA;
        }
        /* Output is full and the caller demands the end: only an end marker
           (reported by the dummy pass as DUMMY_MATCH) is acceptable now. */
        checkEndMarkNow = 1;
      }

      if (p->needInitState)
        LzmaDec_InitStateReal(p);

      if (p->tempBufSize == 0)
      {
        /* Decode straight from the caller's buffer. */
        size_t processed;
        const uint8_t *bufLimit;
        if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow)
        {
          /* Too little input for the unchecked decoder: dry-run one symbol
             to find out whether it is completely available. */
          int dummyRes = LzmaDec_TryDummy(p, src, inSize);
          if (dummyRes == DUMMY_ERROR)
          {
            /* Incomplete symbol: keep the tail for the next call. */
            memcpy(p->tempBuf, src, inSize);
            p->tempBufSize = (unsigned)inSize;
            (*srcLen) += inSize;
            *status = LZMA_STATUS_NEEDS_MORE_INPUT;
            return SZ_OK;
          }
          if (checkEndMarkNow && dummyRes != DUMMY_MATCH)
          {
            *status = LZMA_STATUS_NOT_FINISHED;
            return SZ_ERROR_DATA;
          }
          bufLimit = src;
        }
        else
          bufLimit = src + inSize - LZMA_REQUIRED_INPUT_MAX;
        p->buf = src;
        if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0)
          return SZ_ERROR_DATA;
        processed = (size_t)(p->buf - src);
        (*srcLen) += processed;
        src += processed;
        inSize -= processed;
      }
      else
      {
        /* A partial symbol is parked in tempBuf: top it up from src and
           decode from tempBuf instead. */
        unsigned rem = p->tempBufSize, lookAhead = 0;
        while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize)
          p->tempBuf[rem++] = src[lookAhead++];
        p->tempBufSize = rem;
        if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow)
        {
          int dummyRes = LzmaDec_TryDummy(p, p->tempBuf, rem);
          if (dummyRes == DUMMY_ERROR)
          {
            (*srcLen) += lookAhead;
            *status = LZMA_STATUS_NEEDS_MORE_INPUT;
            return SZ_OK;
          }
          if (checkEndMarkNow && dummyRes != DUMMY_MATCH)
          {
            *status = LZMA_STATUS_NOT_FINISHED;
            return SZ_ERROR_DATA;
          }
        }
        p->buf = p->tempBuf;
        if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0)
          return SZ_ERROR_DATA;
        /* Give back the looked-ahead bytes the decoder did not consume. */
        lookAhead -= (rem - (unsigned)(p->buf - p->tempBuf));
        (*srcLen) += lookAhead;
        src += lookAhead;
        inSize -= lookAhead;
        p->tempBufSize = 0;
      }
  }
  /* The loop ended on the end marker; the range coder must have drained to 0. */
  if (p->code == 0)
    *status = LZMA_STATUS_FINISHED_WITH_MARK;
  return (p->code == 0) ? SZ_OK : SZ_ERROR_DATA;
}
|
24250
|
|
|
|
|
|
|
|
|
24251
|
0
|
|
|
|
|
|
/* Decodes into a caller-supplied output buffer by repeatedly decoding into
   the internal dictionary and copying the newly produced bytes to dest.

   destLen - in: capacity of dest; out: bytes written.
   srcLen  - in: available input bytes; out: bytes consumed.
   The caller's finishMode is only passed on for the chunk that fits the
   remaining output; chunks limited by the dictionary end use LZMA_FINISH_ANY.

   Returns the first non-zero result of LzmaDec_DecodeToDic, otherwise SZ_OK
   once the output is full or a pass produced no output. */
SRes LzmaDec_DecodeToBuf(CLzmaDec *p, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen, ELzmaFinishMode finishMode, ELzmaStatus *status)
{
  size_t outSize = *destLen;
  size_t inSize = *srcLen;
  *srcLen = *destLen = 0;
  for (;;)
  {
    size_t inSizeCur = inSize, outSizeCur, dicPos;
    ELzmaFinishMode curFinishMode;
    SRes res;
    /* Wrap the dictionary write position when the buffer is full. */
    if (p->dicPos == p->dicBufSize)
      p->dicPos = 0;
    dicPos = p->dicPos;
    if (outSize > p->dicBufSize - dicPos)
    {
      outSizeCur = p->dicBufSize;
      curFinishMode = LZMA_FINISH_ANY;
    }
    else
    {
      outSizeCur = dicPos + outSize;
      curFinishMode = finishMode;
    }

    res = LzmaDec_DecodeToDic(p, outSizeCur, src, &inSizeCur, curFinishMode, status);
    src += inSizeCur;
    inSize -= inSizeCur;
    *srcLen += inSizeCur;
    /* From here on outSizeCur is the number of bytes produced by this pass. */
    outSizeCur = p->dicPos - dicPos;
    memcpy(dest, p->dic + dicPos, outSizeCur);
    dest += outSizeCur;
    outSize -= outSizeCur;
    *destLen += outSizeCur;
    if (res != 0)
      return res;
    if (outSizeCur == 0 || outSize == 0)
      return SZ_OK;
  }
}
|
24290
|
|
|
|
|
|
|
|
|
24291
|
0
|
|
|
|
|
|
/* Releases the probability table through alloc and clears the pointer. */
void LzmaDec_FreeProbs(CLzmaDec *p, ISzAlloc *alloc)
{
  alloc->Free(alloc, p->probs);
  p->probs = 0;
}
|
24296
|
|
|
|
|
|
|
|
|
24297
|
|
|
|
|
|
|
/* Releases the dictionary buffer through alloc and clears the pointer. */
static void LzmaDec_FreeDict(CLzmaDec *p, ISzAlloc *alloc)
{
  alloc->Free(alloc, p->dic);
  p->dic = 0;
}
|
24302
|
|
|
|
|
|
|
|
|
24303
|
0
|
|
|
|
|
|
/* Releases both the probability table and the dictionary buffer. */
void LzmaDec_Free(CLzmaDec *p, ISzAlloc *alloc)
{
  LzmaDec_FreeProbs(p, alloc);
  LzmaDec_FreeDict(p, alloc);
}
|
24308
|
|
|
|
|
|
|
|
|
24309
|
6
|
|
|
|
|
|
/* Parses an encoded LZMA properties header into *p.

   data[0]    - packed as (pb * 5 + lp) * 9 + lc; must be below 9 * 5 * 5.
   data[1..4] - dictionary size, little-endian; values below LZMA_DIC_MIN are
                raised to LZMA_DIC_MIN.

   Returns SZ_ERROR_UNSUPPORTED when size < LZMA_PROPS_SIZE or data[0] is out
   of range (p->dicSize has already been written in the latter case),
   SZ_OK otherwise. */
SRes LzmaProps_Decode(CLzmaProps *p, const uint8_t *data, unsigned size)
{
  uint32_t dicSize;
  uint8_t d;

  if (size < LZMA_PROPS_SIZE)
    return SZ_ERROR_UNSUPPORTED;
  else
    dicSize = data[1] | ((uint32_t)data[2] << 8) | ((uint32_t)data[3] << 16) | ((uint32_t)data[4] << 24);

  if (dicSize < LZMA_DIC_MIN)
    dicSize = LZMA_DIC_MIN;
  p->dicSize = dicSize;

  d = data[0];
  if (d >= (9 * 5 * 5))
    return SZ_ERROR_UNSUPPORTED;

  p->lc = d % 9;
  d /= 9;
  p->pb = d / 5;
  p->lp = d % 5;

  return SZ_OK;
}
|
24334
|
|
|
|
|
|
|
|
|
24335
|
12
|
|
|
|
|
|
/* Makes sure p->probs can hold the table required by propNew.
   The existing table is reused when its entry count already matches;
   otherwise it is freed and a new one is obtained from alloc.
   Returns SZ_ERROR_MEM when the allocator returns 0, SZ_OK otherwise. */
static SRes LzmaDec_AllocateProbs2(CLzmaDec *p, const CLzmaProps *propNew, ISzAlloc *alloc)
{
  uint32_t numProbs = LzmaProps_GetNumProbs(propNew);
  if (p->probs == 0 || numProbs != p->numProbs)
  {
    LzmaDec_FreeProbs(p, alloc);
    p->probs = (CLzmaProb *)alloc->Alloc(alloc, numProbs * sizeof(CLzmaProb));
    p->numProbs = numProbs;
    if (p->probs == 0)
      return SZ_ERROR_MEM;
  }
  return SZ_OK;
}
|
24348
|
|
|
|
|
|
|
|
|
24349
|
6
|
|
|
|
|
|
/* Decodes the properties header and allocates only the probability table
   (the dictionary buffer is left to the caller). On success the decoded
   properties are stored in p->prop. Propagates the error code of
   LzmaProps_Decode / LzmaDec_AllocateProbs2 on failure. */
SRes LzmaDec_AllocateProbs(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc)
{
  CLzmaProps propNew;
  RINOK(LzmaProps_Decode(&propNew, props, propsSize));
  RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
  p->prop = propNew;
  return SZ_OK;
}
|
24357
|
|
|
|
|
|
|
|
|
24358
|
0
|
|
|
|
|
|
/* Decodes the properties header and allocates both the probability table and
   a dictionary buffer of propNew.dicSize bytes. An existing dictionary of the
   same size is reused. If the dictionary allocation fails, the probability
   table is released as well and SZ_ERROR_MEM is returned. */
SRes LzmaDec_Allocate(CLzmaDec *p, const uint8_t *props, unsigned propsSize, ISzAlloc *alloc)
{
  CLzmaProps propNew;
  size_t dicBufSize;
  RINOK(LzmaProps_Decode(&propNew, props, propsSize));
  RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc));
  dicBufSize = propNew.dicSize;
  if (p->dic == 0 || dicBufSize != p->dicBufSize)
  {
    LzmaDec_FreeDict(p, alloc);
    p->dic = (uint8_t *)alloc->Alloc(alloc, dicBufSize);
    if (p->dic == 0)
    {
      LzmaDec_FreeProbs(p, alloc);
      return SZ_ERROR_MEM;
    }
  }
  p->dicBufSize = dicBufSize;
  p->prop = propNew;
  return SZ_OK;
}
|
24379
|
|
|
|
|
|
|
|
|
24380
|
6
|
|
|
|
|
|
/* One-call decoder: decompresses src directly into dest, which serves as the
   dictionary buffer (only the probability table is allocated and freed here).

   destLen  - in: capacity of dest; out: number of bytes produced.
   srcLen   - in: size of src; out: number of bytes consumed.
   propData / propSize - encoded LZMA properties header.

   Returns SZ_ERROR_INPUT_EOF when the input is shorter than RC_INIT_SIZE or
   the decoder ends up asking for more input; otherwise the result of the
   properties/allocation step or of LzmaDec_DecodeToDic. */
SRes LzmaDecode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t *srcLen,
    const uint8_t *propData, unsigned propSize, ELzmaFinishMode finishMode,
    ELzmaStatus *status, ISzAlloc *alloc)
{
  CLzmaDec p;
  SRes res;
  size_t inSize = *srcLen;
  size_t outSize = *destLen;
  *srcLen = *destLen = 0;
  if (inSize < RC_INIT_SIZE)
    return SZ_ERROR_INPUT_EOF;

  LzmaDec_Construct(&p);
  res = LzmaDec_AllocateProbs(&p, propData, propSize, alloc);
  if (res != 0)
    return res;
  p.dic = dest;
  p.dicBufSize = outSize;

  LzmaDec_Init(&p);

  *srcLen = inSize;
  res = LzmaDec_DecodeToDic(&p, outSize, src, srcLen, finishMode, status);

  /* All input was supplied up front, so "needs more input" means truncation. */
  if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT)
    res = SZ_ERROR_INPUT_EOF;

  (*destLen) = p.dicPos;
  LzmaDec_FreeProbs(&p, alloc);
  return res;
}
|
24411
|
|
|
|
|
|
|
|
|
24412
|
|
|
|
|
|
|
} // namespace lzma |
|
24413
|
|
|
|
|
|
|
// End of LZMA compression library by Igor Pavlov |
|
24414
|
|
|
|
|
|
|
|
|
24415
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
|
24416
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
|
24417
|
12
|
|
|
|
|
|
// ISzAlloc::Alloc callback backed by new[]. The LZMA code tests the returned
// pointer against 0 to report SZ_ERROR_MEM, so an allocation failure is
// reported as nullptr instead of letting the exception escape through it.
static void *LzmaAlloc(void* /*p*/, size_t size) {
  try {
    return new char[size];
  } catch (...) {  // array new of char can only fail with an allocation error
    return nullptr;
  }
}
|
24418
|
24
|
100
|
|
|
|
|
// ISzAlloc::Free callback: releases a block obtained as a new[]'d char array.
// A null address is a no-op.
static void LzmaFree(void* /*p*/, void *address) { delete[] static_cast<char*>(address); }
|
24419
|
|
|
|
|
|
|
// Allocator handed to the LZMA routines: { Alloc, Free } callbacks.
static lzma::ISzAlloc lzmaAllocator = { LzmaAlloc, LzmaFree };
|
24420
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
|
24421
|
|
|
|
|
|
|
|
|
24422
|
6
|
|
|
|
|
|
// Reads one LZMA-compressed block from `is` and decompresses it into `data`.
//
// Block layout, as read below (integers are copied raw into uint32_t, i.e. in
// host byte order):
//   uint32 uncompressed_len
//   uint32 compressed_len
//   uint32 poor_crc  = uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890
//   LZMA_PROPS_SIZE bytes of encoded LZMA properties
//   compressed_len bytes of LZMA data
//
// Returns false on a short read, a checksum mismatch, a decoder error, or when
// the decoder did not consume/produce exactly the announced number of bytes.
bool compressor::load(istream& is, binary_decoder& data) {
  uint32_t uncompressed_len, compressed_len, poor_crc;
  unsigned char props_encoded[LZMA_PROPS_SIZE];

  if (!is.read((char *) &uncompressed_len, sizeof(uncompressed_len))) return false;
  if (!is.read((char *) &compressed_len, sizeof(compressed_len))) return false;
  if (!is.read((char *) &poor_crc, sizeof(poor_crc))) return false;
  // Cheap sanity check of the two lengths before allocating buffers for them.
  if (poor_crc != uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890) return false;
  if (!is.read((char *) props_encoded, sizeof(props_encoded))) return false;

  vector<unsigned char> compressed(compressed_len);
  if (!is.read((char *) compressed.data(), compressed_len)) return false;

  lzma::ELzmaStatus status;
  size_t uncompressed_size = uncompressed_len, compressed_size = compressed_len;
  // NOTE(review): data.fill(n) presumably sizes the decoder's buffer to n bytes
  // and returns a pointer to it -- confirm against binary_decoder.
  auto res = lzma::LzmaDecode(data.fill(uncompressed_len), &uncompressed_size, compressed.data(), &compressed_size, props_encoded, LZMA_PROPS_SIZE, lzma::LZMA_FINISH_ANY, &status, &lzmaAllocator);
  if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false;

  return true;
}
|
24442
|
|
|
|
|
|
|
|
|
24443
|
|
|
|
|
|
|
} // namespace utils |
|
24444
|
|
|
|
|
|
|
|
|
24445
|
|
|
|
|
|
|
///////// |
|
24446
|
|
|
|
|
|
|
// File: utils/compressor_save.cpp |
|
24447
|
|
|
|
|
|
|
///////// |
|
24448
|
|
|
|
|
|
|
|
|
24449
|
|
|
|
|
|
|
// This file is part of UFAL C++ Utils . |
|
24450
|
|
|
|
|
|
|
// |
|
24451
|
|
|
|
|
|
|
// Copyright 2015 Institute of Formal and Applied Linguistics, Faculty of |
|
24452
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
24453
|
|
|
|
|
|
|
// |
|
24454
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
24455
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
24456
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
24457
|
|
|
|
|
|
|
|
|
24458
|
|
|
|
|
|
|
namespace utils { |
|
24459
|
|
|
|
|
|
|
|
|
24460
|
|
|
|
|
|
|
// Start of LZMA compression library by Igor Pavlov |
|
24461
|
|
|
|
|
|
|
namespace lzma { |
|
24462
|
|
|
|
|
|
|
|
|
24463
|
|
|
|
|
|
|
// Types.h -- Basic types |
|
24464
|
|
|
|
|
|
|
// 2010-10-09 : Igor Pavlov : Public domain |
|
24465
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
|
24466
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
|
24467
|
|
|
|
|
|
|
|
|
24468
|
|
|
|
|
|
|
/* Result codes returned by the LZMA routines; SZ_OK (0) means success. */
#define SZ_OK 0

#define SZ_ERROR_DATA 1
#define SZ_ERROR_MEM 2
#define SZ_ERROR_CRC 3
#define SZ_ERROR_UNSUPPORTED 4
#define SZ_ERROR_PARAM 5
#define SZ_ERROR_INPUT_EOF 6
#define SZ_ERROR_OUTPUT_EOF 7
#define SZ_ERROR_READ 8
#define SZ_ERROR_WRITE 9
#define SZ_ERROR_PROGRESS 10
#define SZ_ERROR_FAIL 11
#define SZ_ERROR_THREAD 12

#define SZ_ERROR_ARCHIVE 16
#define SZ_ERROR_NO_ARCHIVE 17

/* Type of the result codes above. */
typedef int SRes;

/* RINOK(x): evaluates x once and returns its value from the enclosing
   function if it is non-zero ("return if not OK"). */
#ifndef RINOK
#define RINOK(x) { int rinok_result_ = (x); if (rinok_result_ != 0) return rinok_result_; }
#endif
|
24491
|
|
|
|
|
|
|
|
|
24492
|
|
|
|
|
|
|
/* The following interfaces use first parameter as pointer to structure */ |
|
24493
|
|
|
|
|
|
|
|
|
24494
|
|
|
|
|
|
|
/* Byte-at-a-time input callback. */
struct IByteIn
{
  uint8_t (*Read)(void *p); /* reads one byte, returns 0 in case of EOF or error */
};
|
24498
|
|
|
|
|
|
|
|
|
24499
|
|
|
|
|
|
|
/* Byte-at-a-time output callback. */
struct IByteOut
{
  void (*Write)(void *p, uint8_t b);
};
|
24503
|
|
|
|
|
|
|
|
|
24504
|
|
|
|
|
|
|
struct ISeqInStream |
|
24505
|
|
|
|
|
|
|
{ |
|
24506
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
|
24507
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
|
24508
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
|
24509
|
|
|
|
|
|
|
}; |
|
24510
|
|
|
|
|
|
|
|
|
24511
|
|
|
|
|
|
|
/* it can return SZ_ERROR_INPUT_EOF */ |
|
24512
|
|
|
|
|
|
|
SRes SeqInStream_Read(ISeqInStream *stream, void *buf, size_t size); |
|
24513
|
|
|
|
|
|
|
SRes SeqInStream_Read2(ISeqInStream *stream, void *buf, size_t size, SRes errorType); |
|
24514
|
|
|
|
|
|
|
SRes SeqInStream_ReadByte(ISeqInStream *stream, uint8_t *buf); |
|
24515
|
|
|
|
|
|
|
|
|
24516
|
|
|
|
|
|
|
/* Sequential output stream callback. */
struct ISeqOutStream
{
  size_t (*Write)(void *p, const void *buf, size_t size);
    /* Returns: result - the number of actually written bytes.
       (result < size) means error */
};
|
24522
|
|
|
|
|
|
|
|
|
24523
|
|
|
|
|
|
|
/* Origin selector passed to the Seek callbacks. */
enum ESzSeek
{
  SZ_SEEK_SET = 0,
  SZ_SEEK_CUR = 1,
  SZ_SEEK_END = 2
};
|
24529
|
|
|
|
|
|
|
|
|
24530
|
|
|
|
|
|
|
struct ISeekInStream |
|
24531
|
|
|
|
|
|
|
{ |
|
24532
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); /* same as ISeqInStream::Read */ |
|
24533
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
|
24534
|
|
|
|
|
|
|
}; |
|
24535
|
|
|
|
|
|
|
|
|
24536
|
|
|
|
|
|
|
struct ILookInStream |
|
24537
|
|
|
|
|
|
|
{ |
|
24538
|
|
|
|
|
|
|
SRes (*Look)(void *p, const void **buf, size_t *size); |
|
24539
|
|
|
|
|
|
|
/* if (input(*size) != 0 && output(*size) == 0) means end_of_stream. |
|
24540
|
|
|
|
|
|
|
(output(*size) > input(*size)) is not allowed |
|
24541
|
|
|
|
|
|
|
(output(*size) < input(*size)) is allowed */ |
|
24542
|
|
|
|
|
|
|
SRes (*Skip)(void *p, size_t offset); |
|
24543
|
|
|
|
|
|
|
/* offset must be <= output(*size) of Look */ |
|
24544
|
|
|
|
|
|
|
|
|
24545
|
|
|
|
|
|
|
SRes (*Read)(void *p, void *buf, size_t *size); |
|
24546
|
|
|
|
|
|
|
/* reads directly (without buffer). It's same as ISeqInStream::Read */ |
|
24547
|
|
|
|
|
|
|
SRes (*Seek)(void *p, int64_t *pos, ESzSeek origin); |
|
24548
|
|
|
|
|
|
|
}; |
|
24549
|
|
|
|
|
|
|
|
|
24550
|
|
|
|
|
|
|
SRes LookInStream_LookRead(ILookInStream *stream, void *buf, size_t *size); |
|
24551
|
|
|
|
|
|
|
SRes LookInStream_SeekTo(ILookInStream *stream, uint64_t offset); |
|
24552
|
|
|
|
|
|
|
|
|
24553
|
|
|
|
|
|
|
/* reads via ILookInStream::Read */ |
|
24554
|
|
|
|
|
|
|
SRes LookInStream_Read2(ILookInStream *stream, void *buf, size_t size, SRes errorType); |
|
24555
|
|
|
|
|
|
|
SRes LookInStream_Read(ILookInStream *stream, void *buf, size_t size); |
|
24556
|
|
|
|
|
|
|
|
|
24557
|
|
|
|
|
|
|
#define LookToRead_BUF_SIZE (1 << 14) |
|
24558
|
|
|
|
|
|
|
|
|
24559
|
|
|
|
|
|
|
struct CLookToRead |
|
24560
|
|
|
|
|
|
|
{ |
|
24561
|
|
|
|
|
|
|
ILookInStream s; |
|
24562
|
|
|
|
|
|
|
ISeekInStream *realStream; |
|
24563
|
|
|
|
|
|
|
size_t pos; |
|
24564
|
|
|
|
|
|
|
size_t size; |
|
24565
|
|
|
|
|
|
|
uint8_t buf[LookToRead_BUF_SIZE]; |
|
24566
|
|
|
|
|
|
|
}; |
|
24567
|
|
|
|
|
|
|
|
|
24568
|
|
|
|
|
|
|
void LookToRead_CreateVTable(CLookToRead *p, int lookahead); |
|
24569
|
|
|
|
|
|
|
void LookToRead_Init(CLookToRead *p); |
|
24570
|
|
|
|
|
|
|
|
|
24571
|
|
|
|
|
|
|
struct CSecToLook |
|
24572
|
|
|
|
|
|
|
{ |
|
24573
|
|
|
|
|
|
|
ISeqInStream s; |
|
24574
|
|
|
|
|
|
|
ILookInStream *realStream; |
|
24575
|
|
|
|
|
|
|
}; |
|
24576
|
|
|
|
|
|
|
|
|
24577
|
|
|
|
|
|
|
void SecToLook_CreateVTable(CSecToLook *p); |
|
24578
|
|
|
|
|
|
|
|
|
24579
|
|
|
|
|
|
|
struct CSecToRead |
|
24580
|
|
|
|
|
|
|
{ |
|
24581
|
|
|
|
|
|
|
ISeqInStream s; |
|
24582
|
|
|
|
|
|
|
ILookInStream *realStream; |
|
24583
|
|
|
|
|
|
|
}; |
|
24584
|
|
|
|
|
|
|
|
|
24585
|
|
|
|
|
|
|
void SecToRead_CreateVTable(CSecToRead *p); |
|
24586
|
|
|
|
|
|
|
|
|
24587
|
|
|
|
|
|
|
struct ICompressProgress |
|
24588
|
|
|
|
|
|
|
{ |
|
24589
|
|
|
|
|
|
|
SRes (*Progress)(void *p, uint64_t inSize, uint64_t outSize); |
|
24590
|
|
|
|
|
|
|
/* Returns: result. (result != SZ_OK) means break. |
|
24591
|
|
|
|
|
|
|
Value (uint64_t)(int64_t)-1 for size means unknown value. */ |
|
24592
|
|
|
|
|
|
|
}; |
|
24593
|
|
|
|
|
|
|
|
|
24594
|
|
|
|
|
|
|
/* Memory allocator callbacks used by the LZMA routines. */
struct ISzAlloc
{
  void *(*Alloc)(void *p, size_t size);
  void (*Free)(void *p, void *address); /* address can be 0 */
};

/* Convenience wrappers that pass the allocator itself as the first argument. */
#define IAlloc_Alloc(p, size) (p)->Alloc((p), size)
#define IAlloc_Free(p, a) (p)->Free((p), a)
|
24602
|
|
|
|
|
|
|
|
|
24603
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_TYPES_H |
|
24604
|
|
|
|
|
|
|
|
|
24605
|
|
|
|
|
|
|
// LzHash.h -- HASH functions for LZ algorithms |
|
24606
|
|
|
|
|
|
|
// 2009-02-07 : Igor Pavlov : Public domain |
|
24607
|
|
|
|
|
|
|
|
|
24608
|
|
|
|
|
|
|
#define kHash2Size (1 << 10) |
|
24609
|
|
|
|
|
|
|
#define kHash3Size (1 << 16) |
|
24610
|
|
|
|
|
|
|
#define kHash4Size (1 << 20) |
|
24611
|
|
|
|
|
|
|
|
|
24612
|
|
|
|
|
|
|
#define kFix3HashSize (kHash2Size) |
|
24613
|
|
|
|
|
|
|
#define kFix4HashSize (kHash2Size + kHash3Size) |
|
24614
|
|
|
|
|
|
|
#define kFix5HashSize (kHash2Size + kHash3Size + kHash4Size) |
|
24615
|
|
|
|
|
|
|
|
|
24616
|
|
|
|
|
|
|
#define HASH2_CALC hashValue = cur[0] | ((uint32_t)cur[1] << 8); |
|
24617
|
|
|
|
|
|
|
|
|
24618
|
|
|
|
|
|
|
#define HASH3_CALC { \ |
|
24619
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
|
24620
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
|
24621
|
|
|
|
|
|
|
hashValue = (temp ^ ((uint32_t)cur[2] << 8)) & p->hashMask; } |
|
24622
|
|
|
|
|
|
|
|
|
24623
|
|
|
|
|
|
|
#define HASH4_CALC { \ |
|
24624
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
|
24625
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
|
24626
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \ |
|
24627
|
|
|
|
|
|
|
hashValue = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)) & p->hashMask; } |
|
24628
|
|
|
|
|
|
|
|
|
24629
|
|
|
|
|
|
|
#define HASH5_CALC { \ |
|
24630
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
|
24631
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
|
24632
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \ |
|
24633
|
|
|
|
|
|
|
hash4Value = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)); \ |
|
24634
|
|
|
|
|
|
|
hashValue = (hash4Value ^ (p->crc[cur[4]] << 3)) & p->hashMask; \ |
|
24635
|
|
|
|
|
|
|
hash4Value &= (kHash4Size - 1); } |
|
24636
|
|
|
|
|
|
|
|
|
24637
|
|
|
|
|
|
|
/* #define HASH_ZIP_CALC hashValue = ((cur[0] | ((uint32_t)cur[1] << 8)) ^ p->crc[cur[2]]) & 0xFFFF; */ |
|
24638
|
|
|
|
|
|
|
#define HASH_ZIP_CALC hashValue = ((cur[2] | ((uint32_t)cur[0] << 8)) ^ p->crc[cur[1]]) & 0xFFFF; |
|
24639
|
|
|
|
|
|
|
|
|
24640
|
|
|
|
|
|
|
#define MT_HASH2_CALC \ |
|
24641
|
|
|
|
|
|
|
hash2Value = (p->crc[cur[0]] ^ cur[1]) & (kHash2Size - 1); |
|
24642
|
|
|
|
|
|
|
|
|
24643
|
|
|
|
|
|
|
#define MT_HASH3_CALC { \ |
|
24644
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
|
24645
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
|
24646
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); } |
|
24647
|
|
|
|
|
|
|
|
|
24648
|
|
|
|
|
|
|
#define MT_HASH4_CALC { \ |
|
24649
|
|
|
|
|
|
|
uint32_t temp = p->crc[cur[0]] ^ cur[1]; \ |
|
24650
|
|
|
|
|
|
|
hash2Value = temp & (kHash2Size - 1); \ |
|
24651
|
|
|
|
|
|
|
hash3Value = (temp ^ ((uint32_t)cur[2] << 8)) & (kHash3Size - 1); \ |
|
24652
|
|
|
|
|
|
|
hash4Value = (temp ^ ((uint32_t)cur[2] << 8) ^ (p->crc[cur[3]] << 5)) & (kHash4Size - 1); } |
|
24653
|
|
|
|
|
|
|
|
|
24654
|
|
|
|
|
|
|
// LzFind.h -- Match finder for LZ algorithms |
|
24655
|
|
|
|
|
|
|
// 2009-04-22 : Igor Pavlov : Public domain |
|
24656
|
|
|
|
|
|
|
|
|
24657
|
|
|
|
|
|
|
typedef uint32_t CLzRef; |
|
24658
|
|
|
|
|
|
|
|
|
24659
|
|
|
|
|
|
|
struct CMatchFinder |
|
24660
|
|
|
|
|
|
|
{ |
|
24661
|
|
|
|
|
|
|
uint8_t *buffer; |
|
24662
|
|
|
|
|
|
|
uint32_t pos; |
|
24663
|
|
|
|
|
|
|
uint32_t posLimit; |
|
24664
|
|
|
|
|
|
|
uint32_t streamPos; |
|
24665
|
|
|
|
|
|
|
uint32_t lenLimit; |
|
24666
|
|
|
|
|
|
|
|
|
24667
|
|
|
|
|
|
|
uint32_t cyclicBufferPos; |
|
24668
|
|
|
|
|
|
|
uint32_t cyclicBufferSize; /* it must be = (historySize + 1) */ |
|
24669
|
|
|
|
|
|
|
|
|
24670
|
|
|
|
|
|
|
uint32_t matchMaxLen; |
|
24671
|
|
|
|
|
|
|
CLzRef *hash; |
|
24672
|
|
|
|
|
|
|
CLzRef *son; |
|
24673
|
|
|
|
|
|
|
uint32_t hashMask; |
|
24674
|
|
|
|
|
|
|
uint32_t cutValue; |
|
24675
|
|
|
|
|
|
|
|
|
24676
|
|
|
|
|
|
|
uint8_t *bufferBase; |
|
24677
|
|
|
|
|
|
|
ISeqInStream *stream; |
|
24678
|
|
|
|
|
|
|
int streamEndWasReached; |
|
24679
|
|
|
|
|
|
|
|
|
24680
|
|
|
|
|
|
|
uint32_t blockSize; |
|
24681
|
|
|
|
|
|
|
uint32_t keepSizeBefore; |
|
24682
|
|
|
|
|
|
|
uint32_t keepSizeAfter; |
|
24683
|
|
|
|
|
|
|
|
|
24684
|
|
|
|
|
|
|
uint32_t numHashBytes; |
|
24685
|
|
|
|
|
|
|
int directInput; |
|
24686
|
|
|
|
|
|
|
size_t directInputRem; |
|
24687
|
|
|
|
|
|
|
int btMode; |
|
24688
|
|
|
|
|
|
|
int bigHash; |
|
24689
|
|
|
|
|
|
|
uint32_t historySize; |
|
24690
|
|
|
|
|
|
|
uint32_t fixedHashSize; |
|
24691
|
|
|
|
|
|
|
uint32_t hashSizeSum; |
|
24692
|
|
|
|
|
|
|
uint32_t numSons; |
|
24693
|
|
|
|
|
|
|
SRes result; |
|
24694
|
|
|
|
|
|
|
uint32_t crc[256]; |
|
24695
|
|
|
|
|
|
|
}; |
|
24696
|
|
|
|
|
|
|
|
|
24697
|
|
|
|
|
|
|
#define Inline_MatchFinder_GetPointerToCurrentPos(p) ((p)->buffer) |
|
24698
|
|
|
|
|
|
|
#define Inline_MatchFinder_GetIndexByte(p, index) ((p)->buffer[(int32_t)(index)]) |
|
24699
|
|
|
|
|
|
|
|
|
24700
|
|
|
|
|
|
|
#define Inline_MatchFinder_GetNumAvailableBytes(p) ((p)->streamPos - (p)->pos) |
|
24701
|
|
|
|
|
|
|
|
|
24702
|
|
|
|
|
|
|
int MatchFinder_NeedMove(CMatchFinder *p); |
|
24703
|
|
|
|
|
|
|
uint8_t *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p); |
|
24704
|
|
|
|
|
|
|
void MatchFinder_MoveBlock(CMatchFinder *p); |
|
24705
|
|
|
|
|
|
|
void MatchFinder_ReadIfRequired(CMatchFinder *p); |
|
24706
|
|
|
|
|
|
|
|
|
24707
|
|
|
|
|
|
|
void MatchFinder_Construct(CMatchFinder *p); |
|
24708
|
|
|
|
|
|
|
|
|
24709
|
|
|
|
|
|
|
/* Conditions: |
|
24710
|
|
|
|
|
|
|
historySize <= 3 GB |
|
24711
|
|
|
|
|
|
|
keepAddBufferBefore + matchMaxLen + keepAddBufferAfter < 511MB |
|
24712
|
|
|
|
|
|
|
*/ |
|
24713
|
|
|
|
|
|
|
int MatchFinder_Create(CMatchFinder *p, uint32_t historySize, |
|
24714
|
|
|
|
|
|
|
uint32_t keepAddBufferBefore, uint32_t matchMaxLen, uint32_t keepAddBufferAfter, |
|
24715
|
|
|
|
|
|
|
ISzAlloc *alloc); |
|
24716
|
|
|
|
|
|
|
void MatchFinder_Free(CMatchFinder *p, ISzAlloc *alloc); |
|
24717
|
|
|
|
|
|
|
void MatchFinder_Normalize3(uint32_t subValue, CLzRef *items, uint32_t numItems); |
|
24718
|
|
|
|
|
|
|
void MatchFinder_ReduceOffsets(CMatchFinder *p, uint32_t subValue); |
|
24719
|
|
|
|
|
|
|
|
|
24720
|
|
|
|
|
|
|
uint32_t * GetMatchesSpec1(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *buffer, CLzRef *son, |
|
24721
|
|
|
|
|
|
|
uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t _cutValue, |
|
24722
|
|
|
|
|
|
|
uint32_t *distances, uint32_t maxLen); |
|
24723
|
|
|
|
|
|
|
|
|
24724
|
|
|
|
|
|
|
/* |
|
24725
|
|
|
|
|
|
|
Conditions: |
|
24726
|
|
|
|
|
|
|
Mf_GetNumAvailableBytes_Func must be called before each Mf_GetMatchLen_Func. |
|
24727
|
|
|
|
|
|
|
Mf_GetPointerToCurrentPos_Func's result must be used only before any other function |
|
24728
|
|
|
|
|
|
|
*/ |
|
24729
|
|
|
|
|
|
|
|
|
24730
|
|
|
|
|
|
|
typedef void (*Mf_Init_Func)(CMatchFinder *object); |
|
24731
|
|
|
|
|
|
|
typedef uint8_t (*Mf_GetIndexByte_Func)(CMatchFinder *object, int32_t index); |
|
24732
|
|
|
|
|
|
|
typedef uint32_t (*Mf_GetNumAvailableBytes_Func)(CMatchFinder *object); |
|
24733
|
|
|
|
|
|
|
typedef uint8_t * (*Mf_GetPointerToCurrentPos_Func)(CMatchFinder *object); |
|
24734
|
|
|
|
|
|
|
typedef uint32_t (*Mf_GetMatches_Func)(CMatchFinder *object, uint32_t *distances); |
|
24735
|
|
|
|
|
|
|
typedef void (*Mf_Skip_Func)(CMatchFinder *object, uint32_t); |
|
24736
|
|
|
|
|
|
|
|
|
24737
|
|
|
|
|
|
|
struct IMatchFinder |
|
24738
|
|
|
|
|
|
|
{ |
|
24739
|
|
|
|
|
|
|
Mf_Init_Func Init; |
|
24740
|
|
|
|
|
|
|
Mf_GetIndexByte_Func GetIndexByte; |
|
24741
|
|
|
|
|
|
|
Mf_GetNumAvailableBytes_Func GetNumAvailableBytes; |
|
24742
|
|
|
|
|
|
|
Mf_GetPointerToCurrentPos_Func GetPointerToCurrentPos; |
|
24743
|
|
|
|
|
|
|
Mf_GetMatches_Func GetMatches; |
|
24744
|
|
|
|
|
|
|
Mf_Skip_Func Skip; |
|
24745
|
|
|
|
|
|
|
}; |
|
24746
|
|
|
|
|
|
|
|
|
24747
|
|
|
|
|
|
|
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable); |
|
24748
|
|
|
|
|
|
|
|
|
24749
|
|
|
|
|
|
|
void MatchFinder_Init(CMatchFinder *p); |
|
24750
|
|
|
|
|
|
|
uint32_t Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances); |
|
24751
|
|
|
|
|
|
|
uint32_t Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances); |
|
24752
|
|
|
|
|
|
|
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num); |
|
24753
|
|
|
|
|
|
|
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num); |
|
24754
|
|
|
|
|
|
|
|
|
24755
|
|
|
|
|
|
|
// LzFind.c -- Match finder for LZ algorithms |
|
24756
|
|
|
|
|
|
|
// 2009-04-22 : Igor Pavlov : Public domain |
|
24757
|
|
|
|
|
|
|
|
|
24758
|
|
|
|
|
|
|
#define kEmptyHashValue 0 |
|
24759
|
|
|
|
|
|
|
#define kMaxValForNormalize ((uint32_t)0xFFFFFFFF) |
|
24760
|
|
|
|
|
|
|
#define kNormalizeStepMin (1 << 10) /* it must be power of 2 */ |
|
24761
|
|
|
|
|
|
|
#define kNormalizeMask (~(kNormalizeStepMin - 1)) |
|
24762
|
|
|
|
|
|
|
#define kMaxHistorySize ((uint32_t)3 << 30) |
|
24763
|
|
|
|
|
|
|
|
|
24764
|
|
|
|
|
|
|
#define kStartMaxLen 3 |
|
24765
|
|
|
|
|
|
|
|
|
24766
|
|
|
|
|
|
|
/* Releases the window buffer through alloc and clears bufferBase.
   In direct-input mode nothing is freed and bufferBase is left untouched. */
static void LzInWindow_Free(CMatchFinder *p, ISzAlloc *alloc)
{
  if (p->directInput)
    return;

  alloc->Free(alloc, p->bufferBase);
  p->bufferBase = 0;
}
|
24774
|
|
|
|
|
|
|
|
|
24775
|
|
|
|
|
|
|
/* keepSizeBefore + keepSizeAfter + keepSizeReserv must be < 4G) */ |
|
24776
|
|
|
|
|
|
|
|
|
24777
|
0
|
|
|
|
|
|
/* Makes sure the window buffer matches the configured sizes.
   The required size is keepSizeBefore + keepSizeAfter + keepSizeReserv.
   In direct-input mode only blockSize is recorded. Otherwise the existing
   buffer is reused when its size already matches, and reallocated if not.
   Returns 1 on success, 0 if the allocation failed. */
static int LzInWindow_Create(CMatchFinder *p, uint32_t keepSizeReserv, ISzAlloc *alloc)
{
  const uint32_t wanted = p->keepSizeBefore + p->keepSizeAfter + keepSizeReserv;

  if (p->directInput)
  {
    p->blockSize = wanted;
    return 1;
  }

  /* blockSize is only meaningful (and only read) while a buffer exists */
  const int reusable = (p->bufferBase != 0 && p->blockSize == wanted);
  if (!reusable)
  {
    LzInWindow_Free(p, alloc);
    p->blockSize = wanted;
    p->bufferBase = (uint8_t *)alloc->Alloc(alloc, (size_t)wanted);
  }

  return p->bufferBase != 0;
}
|
24793
|
|
|
|
|
|
|
|
|
24794
|
0
|
|
|
|
|
|
/* Returns a pointer to the byte at the current position. */
uint8_t *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p)
{
  return p->buffer;
}
|
24795
|
0
|
|
|
|
|
|
/* Returns the byte at the signed offset index from the current position. */
uint8_t MatchFinder_GetIndexByte(CMatchFinder *p, int32_t index)
{
  return *(p->buffer + index);
}
|
24796
|
|
|
|
|
|
|
|
|
24797
|
0
|
|
|
|
|
|
/* Returns how many bytes lie between the current position and the end of
   the data read so far (streamPos - pos). */
uint32_t MatchFinder_GetNumAvailableBytes(CMatchFinder *p)
{
  const uint32_t available = p->streamPos - p->pos;
  return available;
}
|
24798
|
|
|
|
|
|
|
|
|
24799
|
0
|
|
|
|
|
|
/* Shifts all three position counters (pos, streamPos, posLimit) down by
   subValue. */
void MatchFinder_ReduceOffsets(CMatchFinder *p, uint32_t subValue)
{
  p->pos       -= subValue;
  p->streamPos -= subValue;
  p->posLimit  -= subValue;
}
|
24805
|
|
|
|
|
|
|
|
|
24806
|
0
|
|
|
|
|
|
/* Pulls more input into the window.
   Does nothing once the end of the stream was seen or a read error is stored
   in p->result.
   Direct-input mode: nothing is copied; streamPos is advanced by as much of
   directInputRem as fits below 0xFFFFFFFF, and the end flag is set once
   directInputRem reaches zero.
   Stream mode: reads repeatedly into the free tail of the window until the
   window is full, the stream reports an error or end of data, or more than
   keepSizeAfter bytes are available past the current position. */
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
  if (p->streamEndWasReached)
    return;
  if (p->result != SZ_OK)
    return;

  if (p->directInput)
  {
    uint32_t take = 0xFFFFFFFF - p->streamPos;
    if (p->directInputRem < take)
      take = (uint32_t)p->directInputRem;
    p->streamPos += take;
    p->directInputRem -= take;
    if (p->directInputRem == 0)
      p->streamEndWasReached = 1;
    return;
  }

  do
  {
    uint8_t *writePtr = p->buffer + (p->streamPos - p->pos);
    /* in: free room at the end of the window; out: bytes actually read */
    size_t room = (p->bufferBase + p->blockSize - writePtr);
    if (room == 0)
      return;

    p->result = p->stream->Read(p->stream, writePtr, &room);
    if (p->result != SZ_OK)
      return;

    if (room == 0)
    {
      /* a successful zero-byte read marks the end of the stream */
      p->streamEndWasReached = 1;
      return;
    }
    p->streamPos += (uint32_t)room;
  }
  while (p->streamPos - p->pos <= p->keepSizeAfter);
}
|
24840
|
|
|
|
|
|
|
|
|
24841
|
0
|
|
|
|
|
|
/* Slides the live part of the window to the start of the allocation: the
   keepSizeBefore bytes preceding the current position plus all bytes already
   read after it. buffer is then re-pointed so that it still addresses the
   current position. */
void MatchFinder_MoveBlock(CMatchFinder *p)
{
  const uint8_t *keepStart = p->buffer - p->keepSizeBefore;
  const size_t keepBytes = (size_t)(p->streamPos - p->pos + p->keepSizeBefore);

  memmove(p->bufferBase, keepStart, keepBytes);
  p->buffer = p->bufferBase + p->keepSizeBefore;
}
|
24848
|
|
|
|
|
|
|
|
|
24849
|
0
|
|
|
|
|
|
/* Returns nonzero when the space between the current position and the end
   of the window allocation is no larger than keepSizeAfter, i.e. the window
   has to be slid before more data can be read. Always 0 in direct-input
   mode. */
int MatchFinder_NeedMove(CMatchFinder *p)
{
  if (p->directInput)
    return 0;

  /* if (p->streamEndWasReached) return 0; */
  const size_t tail = (size_t)(p->bufferBase + p->blockSize - p->buffer);
  return (tail <= p->keepSizeAfter) ? 1 : 0;
}
|
24856
|
|
|
|
|
|
|
|
|
24857
|
0
|
|
|
|
|
|
/* Reads more input unless the stream already ended or more than
   keepSizeAfter bytes are still available past the current position. */
void MatchFinder_ReadIfRequired(CMatchFinder *p)
{
  if (!p->streamEndWasReached && p->streamPos - p->pos <= p->keepSizeAfter)
    MatchFinder_ReadBlock(p);
}
|
24864
|
|
|
|
|
|
|
|
|
24865
|
0
|
|
|
|
|
|
/* Slides the window first if MatchFinder_NeedMove says so, then reads more
   input. */
static void MatchFinder_CheckAndMoveAndRead(CMatchFinder *p)
{
  const int mustSlide = MatchFinder_NeedMove(p);
  if (mustSlide)
  {
    MatchFinder_MoveBlock(p);
  }
  MatchFinder_ReadBlock(p);
}
|
24871
|
|
|
|
|
|
|
|
|
24872
|
|
|
|
|
|
|
/* Installs the default configuration: btMode on, 4 hash bytes, a cut value
   of 32, bigHash off. */
static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
{
  p->btMode = 1;
  p->numHashBytes = 4;
  p->cutValue = 32;
  p->bigHash = 0;
}
|
24879
|
|
|
|
|
|
|
|
|
24880
|
|
|
|
|
|
|
#define kCrcPoly 0xEDB88320 |
|
24881
|
|
|
|
|
|
|
|
|
24882
|
0
|
|
|
|
|
|
/* Puts a CMatchFinder into its initial, unallocated state: no window buffer,
   no hash memory, stream (non-direct) input, default settings. Also builds
   the 256-entry CRC table (polynomial kCrcPoly) used by the hash macros. */
void MatchFinder_Construct(CMatchFinder *p)
{
  p->bufferBase = 0;
  p->directInput = 0;
  p->hash = 0;
  MatchFinder_SetDefaultSettings(p);

  for (uint32_t byteValue = 0; byteValue < 256; byteValue++)
  {
    uint32_t r = byteValue;
    for (int bit = 0; bit < 8; bit++)
    {
      /* shift right; fold in the polynomial when the dropped bit was set */
      r = (r & 1) ? ((r >> 1) ^ kCrcPoly) : (r >> 1);
    }
    p->crc[byteValue] = r;
  }
}
|
24899
|
|
|
|
|
|
|
|
|
24900
|
|
|
|
|
|
|
/* Releases the hash/son allocation through alloc and clears p->hash. */
static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAlloc *alloc)
{
  CLzRef *tables = p->hash;
  alloc->Free(alloc, tables);
  p->hash = 0;
}
|
24905
|
|
|
|
|
|
|
|
|
24906
|
0
|
|
|
|
|
|
/* Releases everything the match finder allocated: first the hash/son
   tables, then the window buffer (the latter only when not in direct-input
   mode). */
void MatchFinder_Free(CMatchFinder *p, ISzAlloc *alloc)
{
  /* hash tables */
  MatchFinder_FreeThisClassMemory(p, alloc);
  /* input window */
  LzInWindow_Free(p, alloc);
}
|
24911
|
|
|
|
|
|
|
|
|
24912
|
|
|
|
|
|
|
/* Allocates an array of num CLzRef entries through alloc.
   Returns 0 when the byte count does not fit in size_t, otherwise whatever
   alloc->Alloc returns. */
static CLzRef* AllocRefs(uint32_t num, ISzAlloc *alloc)
{
  const size_t bytes = (size_t)num * sizeof(CLzRef);
  const int overflowed = (bytes / sizeof(CLzRef) != num);

  return overflowed ? 0 : (CLzRef *)alloc->Alloc(alloc, bytes);
}
|
24919
|
|
|
|
|
|
|
|
|
24920
|
0
|
|
|
|
|
|
/* Sizes and (re)allocates the match finder for the given parameters.

   p                   - match finder previously set up by MatchFinder_Construct
   historySize         - dictionary size; must not exceed kMaxHistorySize
   keepAddBufferBefore - extra bytes to keep before the current position
   matchMaxLen         - maximum match length
   keepAddBufferAfter  - extra look-ahead bytes to keep after the current position
   alloc               - allocator used for the window and the hash/son tables

   Returns 1 on success. On any failure everything is released with
   MatchFinder_Free and 0 is returned.

   An existing hash/son allocation is reused when its total entry count is
   unchanged. */
int MatchFinder_Create(CMatchFinder *p, uint32_t historySize,
    uint32_t keepAddBufferBefore, uint32_t matchMaxLen, uint32_t keepAddBufferAfter,
    ISzAlloc *alloc)
{
  uint32_t sizeReserv;
  if (historySize > kMaxHistorySize)
  {
    MatchFinder_Free(p, alloc);
    return 0;
  }
  /* reserve: half the history (a quarter above 2 GB) plus half of the keep areas and 512 KB */
  sizeReserv = historySize >> 1;
  if (historySize > ((uint32_t)2 << 30))
    sizeReserv = historySize >> 2;
  sizeReserv += (keepAddBufferBefore + matchMaxLen + keepAddBufferAfter) / 2 + (1 << 19);

  p->keepSizeBefore = historySize + keepAddBufferBefore + 1;
  p->keepSizeAfter = matchMaxLen + keepAddBufferAfter;
  /* we need one additional byte, since we use MoveBlock after pos++ and before dictionary using */
  if (LzInWindow_Create(p, sizeReserv, alloc))
  {
    uint32_t newCyclicBufferSize = historySize + 1;
    uint32_t hs;
    p->matchMaxLen = matchMaxLen;
    {
      p->fixedHashSize = 0;
      if (p->numHashBytes == 2)
        hs = (1 << 16) - 1;
      else
      {
        /* smear the top bit of (historySize - 1) downwards to build a mask, then halve it */
        hs = historySize - 1;
        hs |= (hs >> 1);
        hs |= (hs >> 2);
        hs |= (hs >> 4);
        hs |= (hs >> 8);
        hs >>= 1;
        hs |= 0xFFFF; /* don't change it! It's required for Deflate */
        if (hs > (1 << 24))
        {
          if (p->numHashBytes == 3)
            hs = (1 << 24) - 1;
          else
            hs >>= 1;
        }
      }
      p->hashMask = hs;
      hs++;
      if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size;
      if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size;
      if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size;
      hs += p->fixedHashSize;
    }

    {
      /* hashSizeSum/numSons are not set by MatchFinder_Construct, so they are
         only meaningful (and only read) while a table allocation exists. */
      const int haveTables = (p->hash != 0);
      const uint32_t prevSize = haveTables ? p->hashSizeSum + p->numSons : 0;
      uint32_t newSize;
      p->historySize = historySize;
      p->hashSizeSum = hs;
      p->cyclicBufferSize = newCyclicBufferSize;
      p->numSons = (p->btMode ? newCyclicBufferSize * 2 : newCyclicBufferSize);
      newSize = p->hashSizeSum + p->numSons;
      if (haveTables && prevSize == newSize)
      {
        /* Same total size, but the hash/son split may have changed:
           re-derive son so it always starts right after the hash tables. */
        p->son = p->hash + p->hashSizeSum;
        return 1;
      }
      MatchFinder_FreeThisClassMemory(p, alloc);
      p->hash = AllocRefs(newSize, alloc);
      if (p->hash != 0)
      {
        p->son = p->hash + p->hashSizeSum;
        return 1;
      }
    }
  }
  MatchFinder_Free(p, alloc);
  return 0;
}
|
24994
|
|
|
|
|
|
|
|
|
24995
|
0
|
|
|
|
|
|
/* Recomputes lenLimit and posLimit for the current position.
   posLimit is pos plus the smallest of:
     - the distance until pos reaches kMaxValForNormalize,
     - the distance until cyclicBufferPos reaches cyclicBufferSize,
     - the available bytes beyond keepSizeAfter (or 1 if some, but no more
       than keepSizeAfter, bytes are available; 0 if none are).
   lenLimit is the number of available bytes, capped at matchMaxLen. */
static void MatchFinder_SetLimits(CMatchFinder *p)
{
  const uint32_t avail = p->streamPos - p->pos;

  uint32_t steps = kMaxValForNormalize - p->pos;

  const uint32_t untilWrap = p->cyclicBufferSize - p->cyclicBufferPos;
  if (untilWrap < steps)
    steps = untilWrap;

  uint32_t untilRefill;
  if (avail > p->keepSizeAfter)
    untilRefill = avail - p->keepSizeAfter;
  else
    untilRefill = (avail > 0) ? 1 : 0;
  if (untilRefill < steps)
    steps = untilRefill;

  p->lenLimit = (avail > p->matchMaxLen) ? p->matchMaxLen : avail;
  p->posLimit = p->pos + steps;
}
|
25019
|
|
|
|
|
|
|
|
|
25020
|
0
|
|
|
|
|
|
/* Resets the match finder for a new stream: clears the first hashSizeSum
   hash entries, rewinds the window to bufferBase, starts pos and streamPos
   at cyclicBufferSize, clears the error/end flags, then reads the first
   block of input and computes the limits. */
void MatchFinder_Init(CMatchFinder *p)
{
  uint32_t slot = 0;
  while (slot < p->hashSizeSum)
  {
    p->hash[slot] = kEmptyHashValue;
    ++slot;
  }

  p->buffer = p->bufferBase;
  p->cyclicBufferPos = 0;
  p->streamPos = p->cyclicBufferSize;
  p->pos = p->streamPos;
  p->streamEndWasReached = 0;
  p->result = SZ_OK;

  MatchFinder_ReadBlock(p);
  MatchFinder_SetLimits(p);
}
|
25033
|
|
|
|
|
|
|
|
|
25034
|
|
|
|
|
|
|
/* Returns the amount by which positions can be reduced during
   normalization: (pos - historySize - 1) with the low bits cleared by
   kNormalizeMask. */
static uint32_t MatchFinder_GetSubValue(CMatchFinder *p)
{
  const uint32_t excess = p->pos - p->historySize - 1;
  return excess & kNormalizeMask;
}
|
25038
|
|
|
|
|
|
|
|
|
25039
|
0
|
|
|
|
|
|
void MatchFinder_Normalize3(uint32_t subValue, CLzRef *items, uint32_t numItems) |
|
25040
|
|
|
|
|
|
|
{ |
|
25041
|
|
|
|
|
|
|
uint32_t i; |
|
25042
|
0
|
0
|
|
|
|
|
for (i = 0; i < numItems; i++) |
|
|
|
0
|
|
|
|
|
|
|
25043
|
|
|
|
|
|
|
{ |
|
25044
|
0
|
|
|
|
|
|
uint32_t value = items[i]; |
|
25045
|
0
|
0
|
|
|
|
|
if (value <= subValue) |
|
|
|
0
|
|
|
|
|
|
|
25046
|
|
|
|
|
|
|
value = kEmptyHashValue; |
|
25047
|
|
|
|
|
|
|
else |
|
25048
|
0
|
|
|
|
|
|
value -= subValue; |
|
25049
|
0
|
|
|
|
|
|
items[i] = value; |
|
25050
|
|
|
|
|
|
|
} |
|
25051
|
0
|
|
|
|
|
|
} |
|
25052
|
|
|
|
|
|
|
|
|
25053
|
0
|
|
|
|
|
|
/* Rebases every stored position: all hashSizeSum + numSons entries starting
   at p->hash, followed by the finder's own position counters. */
static void MatchFinder_Normalize(CMatchFinder *p)
{
  const uint32_t shift = MatchFinder_GetSubValue(p);
  const uint32_t entryCount = p->hashSizeSum + p->numSons;

  MatchFinder_Normalize3(shift, p->hash, entryCount);
  MatchFinder_ReduceOffsets(p, shift);
}
|
25059
|
|
|
|
|
|
|
|
|
25060
|
0
|
|
|
|
|
|
/* Handles whichever limit was hit when pos reached posLimit:
     - normalizes positions when pos reached kMaxValForNormalize,
     - slides the window / reads more input when exactly keepSizeAfter bytes
       remain and the stream has not ended,
     - wraps cyclicBufferPos back to 0 when it reached cyclicBufferSize,
   and finally recomputes the limits. */
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
  if (p->pos == kMaxValForNormalize)
  {
    MatchFinder_Normalize(p);
  }

  if (!p->streamEndWasReached)
  {
    if (p->keepSizeAfter == p->streamPos - p->pos)
      MatchFinder_CheckAndMoveAndRead(p);
  }

  if (p->cyclicBufferPos == p->cyclicBufferSize)
  {
    p->cyclicBufferPos = 0;
  }

  MatchFinder_SetLimits(p);
}
|
25070
|
|
|
|
|
|
|
|
|
25071
|
0
|
|
|
|
|
|
/* Candidate-chain match search.
   Stores curMatch as the link for the current cyclic slot, then follows the
   links in son, examining at most cutValue candidates and stopping at the
   first candidate whose distance is _cyclicBufferSize or more. Every
   candidate that yields a match longer than the best so far (maxLen on
   entry) appends a (length, distance - 1) pair to distances. The search ends
   early when a match of lenLimit bytes is found.
   Returns the pointer just past the last pair written. */
static uint32_t * Hc_GetMatchesSpec(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son,
    uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue,
    uint32_t *distances, uint32_t maxLen)
{
  son[_cyclicBufferPos] = curMatch;

  while (cutValue-- != 0)
  {
    const uint32_t delta = pos - curMatch;
    if (delta >= _cyclicBufferSize)
      break;

    const uint8_t *cand = cur - delta;

    /* fetch the next link now; curMatch is not needed below */
    uint32_t slot = _cyclicBufferPos - delta;
    if (delta > _cyclicBufferPos)
      slot += _cyclicBufferSize;
    curMatch = son[slot];

    /* quick reject: the byte that would extend the best match, then the first byte */
    if (cand[maxLen] != cur[maxLen] || *cand != *cur)
      continue;

    uint32_t len = 1;
    while (len != lenLimit && cand[len] == cur[len])
      ++len;

    if (maxLen < len)
    {
      maxLen = len;
      distances[0] = len;
      distances[1] = delta - 1;
      distances += 2;
      if (len == lenLimit)
        return distances;
    }
  }

  return distances;
}
|
25101
|
|
|
|
|
|
|
|
|
25102
|
0
|
|
|
|
|
|
uint32_t * GetMatchesSpec1(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son, |
|
25103
|
|
|
|
|
|
|
uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue, |
|
25104
|
|
|
|
|
|
|
uint32_t *distances, uint32_t maxLen) |
|
25105
|
|
|
|
|
|
|
{ |
|
25106
|
0
|
|
|
|
|
|
CLzRef *ptr0 = son + (_cyclicBufferPos << 1) + 1; |
|
25107
|
0
|
|
|
|
|
|
CLzRef *ptr1 = son + (_cyclicBufferPos << 1); |
|
25108
|
|
|
|
|
|
|
uint32_t len0 = 0, len1 = 0; |
|
25109
|
|
|
|
|
|
|
for (;;) |
|
25110
|
|
|
|
|
|
|
{ |
|
25111
|
0
|
|
|
|
|
|
uint32_t delta = pos - curMatch; |
|
25112
|
0
|
0
|
|
|
|
|
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
25113
|
|
|
|
|
|
|
{ |
|
25114
|
0
|
|
|
|
|
|
*ptr0 = *ptr1 = kEmptyHashValue; |
|
25115
|
0
|
|
|
|
|
|
return distances; |
|
25116
|
|
|
|
|
|
|
} |
|
25117
|
|
|
|
|
|
|
{ |
|
25118
|
0
|
0
|
|
|
|
|
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
|
25119
|
0
|
|
|
|
|
|
const uint8_t *pb = cur - delta; |
|
25120
|
0
|
0
|
|
|
|
|
uint32_t len = (len0 < len1 ? len0 : len1); |
|
25121
|
0
|
0
|
|
|
|
|
if (pb[len] == cur[len]) |
|
25122
|
|
|
|
|
|
|
{ |
|
25123
|
0
|
0
|
|
|
|
|
if (++len != lenLimit && pb[len] == cur[len]) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
25124
|
0
|
0
|
|
|
|
|
while (++len != lenLimit) |
|
25125
|
0
|
0
|
|
|
|
|
if (pb[len] != cur[len]) |
|
25126
|
|
|
|
|
|
|
break; |
|
25127
|
0
|
0
|
|
|
|
|
if (maxLen < len) |
|
25128
|
|
|
|
|
|
|
{ |
|
25129
|
0
|
|
|
|
|
|
*distances++ = maxLen = len; |
|
25130
|
0
|
|
|
|
|
|
*distances++ = delta - 1; |
|
25131
|
0
|
0
|
|
|
|
|
if (len == lenLimit) |
|
25132
|
|
|
|
|
|
|
{ |
|
25133
|
0
|
|
|
|
|
|
*ptr1 = pair[0]; |
|
25134
|
0
|
|
|
|
|
|
*ptr0 = pair[1]; |
|
25135
|
0
|
|
|
|
|
|
return distances; |
|
25136
|
|
|
|
|
|
|
} |
|
25137
|
|
|
|
|
|
|
} |
|
25138
|
|
|
|
|
|
|
} |
|
25139
|
0
|
0
|
|
|
|
|
if (pb[len] < cur[len]) |
|
25140
|
|
|
|
|
|
|
{ |
|
25141
|
0
|
|
|
|
|
|
*ptr1 = curMatch; |
|
25142
|
0
|
|
|
|
|
|
ptr1 = pair + 1; |
|
25143
|
0
|
|
|
|
|
|
curMatch = *ptr1; |
|
25144
|
|
|
|
|
|
|
len1 = len; |
|
25145
|
|
|
|
|
|
|
} |
|
25146
|
|
|
|
|
|
|
else |
|
25147
|
|
|
|
|
|
|
{ |
|
25148
|
0
|
|
|
|
|
|
*ptr0 = curMatch; |
|
25149
|
|
|
|
|
|
|
ptr0 = pair; |
|
25150
|
0
|
|
|
|
|
|
curMatch = *ptr0; |
|
25151
|
|
|
|
|
|
|
len0 = len; |
|
25152
|
|
|
|
|
|
|
} |
|
25153
|
|
|
|
|
|
|
} |
|
25154
|
|
|
|
|
|
|
} |
|
25155
|
|
|
|
|
|
|
} |
|
25156
|
|
|
|
|
|
|
|
|
25157
|
0
|
|
|
|
|
|
static void SkipMatchesSpec(uint32_t lenLimit, uint32_t curMatch, uint32_t pos, const uint8_t *cur, CLzRef *son, |
|
25158
|
|
|
|
|
|
|
uint32_t _cyclicBufferPos, uint32_t _cyclicBufferSize, uint32_t cutValue) |
|
25159
|
|
|
|
|
|
|
{ |
|
25160
|
0
|
|
|
|
|
|
CLzRef *ptr0 = son + (_cyclicBufferPos << 1) + 1; |
|
25161
|
0
|
|
|
|
|
|
CLzRef *ptr1 = son + (_cyclicBufferPos << 1); |
|
25162
|
|
|
|
|
|
|
uint32_t len0 = 0, len1 = 0; |
|
25163
|
|
|
|
|
|
|
for (;;) |
|
25164
|
|
|
|
|
|
|
{ |
|
25165
|
0
|
|
|
|
|
|
uint32_t delta = pos - curMatch; |
|
25166
|
0
|
0
|
|
|
|
|
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
25167
|
|
|
|
|
|
|
{ |
|
25168
|
0
|
|
|
|
|
|
*ptr0 = *ptr1 = kEmptyHashValue; |
|
25169
|
0
|
|
|
|
|
|
return; |
|
25170
|
|
|
|
|
|
|
} |
|
25171
|
|
|
|
|
|
|
{ |
|
25172
|
0
|
0
|
|
|
|
|
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
|
25173
|
0
|
|
|
|
|
|
const uint8_t *pb = cur - delta; |
|
25174
|
0
|
0
|
|
|
|
|
uint32_t len = (len0 < len1 ? len0 : len1); |
|
25175
|
0
|
0
|
|
|
|
|
if (pb[len] == cur[len]) |
|
25176
|
|
|
|
|
|
|
{ |
|
25177
|
0
|
0
|
|
|
|
|
while (++len != lenLimit) |
|
25178
|
0
|
0
|
|
|
|
|
if (pb[len] != cur[len]) |
|
25179
|
|
|
|
|
|
|
break; |
|
25180
|
|
|
|
|
|
|
{ |
|
25181
|
0
|
0
|
|
|
|
|
if (len == lenLimit) |
|
25182
|
|
|
|
|
|
|
{ |
|
25183
|
0
|
|
|
|
|
|
*ptr1 = pair[0]; |
|
25184
|
0
|
|
|
|
|
|
*ptr0 = pair[1]; |
|
25185
|
0
|
|
|
|
|
|
return; |
|
25186
|
|
|
|
|
|
|
} |
|
25187
|
|
|
|
|
|
|
} |
|
25188
|
|
|
|
|
|
|
} |
|
25189
|
0
|
0
|
|
|
|
|
if (pb[len] < cur[len]) |
|
25190
|
|
|
|
|
|
|
{ |
|
25191
|
0
|
|
|
|
|
|
*ptr1 = curMatch; |
|
25192
|
0
|
|
|
|
|
|
ptr1 = pair + 1; |
|
25193
|
0
|
|
|
|
|
|
curMatch = *ptr1; |
|
25194
|
|
|
|
|
|
|
len1 = len; |
|
25195
|
|
|
|
|
|
|
} |
|
25196
|
|
|
|
|
|
|
else |
|
25197
|
|
|
|
|
|
|
{ |
|
25198
|
0
|
|
|
|
|
|
*ptr0 = curMatch; |
|
25199
|
|
|
|
|
|
|
ptr0 = pair; |
|
25200
|
0
|
|
|
|
|
|
curMatch = *ptr0; |
|
25201
|
|
|
|
|
|
|
len0 = len; |
|
25202
|
|
|
|
|
|
|
} |
|
25203
|
|
|
|
|
|
|
} |
|
25204
|
|
|
|
|
|
|
} |
|
25205
|
|
|
|
|
|
|
} |
|
25206
|
|
|
|
|
|
|
|
|
25207
|
|
|
|
|
|
|
#define MOVE_POS \ |
|
25208
|
|
|
|
|
|
|
++p->cyclicBufferPos; \ |
|
25209
|
|
|
|
|
|
|
p->buffer++; \ |
|
25210
|
|
|
|
|
|
|
if (++p->pos == p->posLimit) MatchFinder_CheckLimits(p); |
|
25211
|
|
|
|
|
|
|
|
|
25212
|
|
|
|
|
|
|
#define MOVE_POS_RET MOVE_POS return offset; |
|
25213
|
|
|
|
|
|
|
|
|
25214
|
0
|
0
|
|
|
|
|
/* Advances the match finder by one byte without searching or inserting
   anything (see MOVE_POS). */
static void MatchFinder_MovePos(CMatchFinder *p)
{
  MOVE_POS;
}
|
25215
|
|
|
|
|
|
|
|
|
25216
|
|
|
|
|
|
|
#define GET_MATCHES_HEADER2(minLen, ret_op) \ |
|
25217
|
|
|
|
|
|
|
uint32_t lenLimit; uint32_t hashValue; const uint8_t *cur; uint32_t curMatch; \ |
|
25218
|
|
|
|
|
|
|
lenLimit = p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \ |
|
25219
|
|
|
|
|
|
|
cur = p->buffer; |
|
25220
|
|
|
|
|
|
|
|
|
25221
|
|
|
|
|
|
|
#define GET_MATCHES_HEADER(minLen) GET_MATCHES_HEADER2(minLen, return 0) |
|
25222
|
|
|
|
|
|
|
#define SKIP_HEADER(minLen) GET_MATCHES_HEADER2(minLen, continue) |
|
25223
|
|
|
|
|
|
|
|
|
25224
|
|
|
|
|
|
|
/* The six match-finder fields that are forwarded, in this order, to the
   tree/chain helpers (GetMatchesSpec1, SkipMatchesSpec, Hc_GetMatchesSpec). */
#define MF_PARAMS(p) \
  p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue

/* Common epilogue of the binary-tree GetMatches routines.
   Calls GetMatchesSpec1 with the output cursor `distances + offset`, stores
   the distance of its returned pointer from `distances` back into `offset`,
   then advances the finder and returns `offset` (MOVE_POS_RET).
   Relies on the locals created by GET_MATCHES_HEADER and on `distances`. */
#define GET_MATCHES_FOOTER(offset, maxLen) \
  offset = (uint32_t)(GetMatchesSpec1(lenLimit, curMatch, MF_PARAMS(p), \
                                      distances + offset, maxLen) - distances); \
  MOVE_POS_RET;

/* Common epilogue of the binary-tree Skip loops: calls SkipMatchesSpec for
   the current position and then advances the finder by one byte. */
#define SKIP_FOOTER \
  SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p)); \
  MOVE_POS;
|
25232
|
|
|
|
|
|
|
|
|
25233
|
0
|
|
|
|
|
|
/* GetMatches for the binary-tree finder with a 2-byte hash.
   p         - match finder state; advanced by one byte on every path.
   distances - output buffer handed on to GetMatchesSpec1.
   Returns the value left in `offset` by GET_MATCHES_FOOTER, or 0 when fewer
   than 2 bytes are available. */
static uint32_t Bt2_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t offset = 0;
  GET_MATCHES_HEADER(2)

  HASH2_CALC;

  /* Remember the previous head of this hash bucket, then make the current
     position the new head. */
  curMatch = p->hash[hashValue];
  p->hash[hashValue] = p->pos;

  GET_MATCHES_FOOTER(offset, 1)
}
|
25243
|
|
|
|
|
|
|
|
|
25244
|
0
|
|
|
|
|
|
/* GetMatches for the binary-tree finder using the HASH_ZIP_CALC hash
   (requires at least 3 available bytes).
   p         - match finder state; advanced by one byte on every path.
   distances - output buffer handed on to GetMatchesSpec1.
   Returns the value left in `offset` by GET_MATCHES_FOOTER, or 0 when fewer
   than 3 bytes are available. */
uint32_t Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t offset = 0;
  GET_MATCHES_HEADER(3)

  HASH_ZIP_CALC;

  /* Previous bucket head becomes the tree search start; the current
     position becomes the new head. */
  curMatch = p->hash[hashValue];
  p->hash[hashValue] = p->pos;

  GET_MATCHES_FOOTER(offset, 2)
}
|
25254
|
|
|
|
|
|
|
|
|
25255
|
0
|
|
|
|
|
|
/* GetMatches for the binary-tree finder with 2-byte and 3-byte hashes.
   First probes the 2-byte hash bucket directly; a hit is extended byte by
   byte and written as distances[0] = length, distances[1] = distance - 1.
   If that match already reaches lenLimit the tree is only updated
   (SkipMatchesSpec); otherwise the tree search continues through
   GET_MATCHES_FOOTER.
   Returns the final `offset`, or 0 when fewer than 3 bytes are available. */
static uint32_t Bt3_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t hash2Value;   /* presumably assigned by HASH3_CALC (macro defined elsewhere) */
  uint32_t bestLen = 2;
  uint32_t offset = 0;
  GET_MATCHES_HEADER(3)

  HASH3_CALC;

  /* Read both bucket heads before overwriting them with the current position. */
  const uint32_t dist2 = p->pos - p->hash[hash2Value];
  curMatch = p->hash[kFix3HashSize + hashValue];

  p->hash[hash2Value] = p->pos;
  p->hash[kFix3HashSize + hashValue] = p->pos;

  if (dist2 < p->cyclicBufferSize && *(cur - dist2) == *cur)
  {
    /* Extend the candidate until it mismatches or hits lenLimit. */
    while (bestLen != lenLimit && cur[(ptrdiff_t)bestLen - dist2] == cur[bestLen])
      bestLen++;

    distances[0] = bestLen;
    distances[1] = dist2 - 1;
    offset = 2;

    if (bestLen == lenLimit)
    {
      SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p));
      MOVE_POS_RET;
    }
  }

  GET_MATCHES_FOOTER(offset, bestLen)
}
|
25286
|
|
|
|
|
|
|
|
|
25287
|
0
|
|
|
|
|
|
/* GetMatches for the binary-tree finder with 2-, 3- and 4-byte hashes.
   Probes the 2-byte and 3-byte buckets directly, recording up to two
   (length, distance - 1) pairs in distances[]; the most recently recorded
   candidate is then extended byte by byte.  If it reaches lenLimit the tree
   is only updated (SkipMatchesSpec); otherwise the tree search continues
   through GET_MATCHES_FOOTER with a minimum length of 3.
   Returns the final `offset`, or 0 when fewer than 4 bytes are available. */
static uint32_t Bt4_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t hash2Value, hash3Value;   /* presumably assigned by HASH4_CALC (macro defined elsewhere) */
  uint32_t bestLen = 1;
  uint32_t offset = 0;
  GET_MATCHES_HEADER(4)

  HASH4_CALC;

  /* Read all three bucket heads before overwriting them. */
  uint32_t dist2 = p->pos - p->hash[hash2Value];
  const uint32_t dist3 = p->pos - p->hash[kFix3HashSize + hash3Value];
  curMatch = p->hash[kFix4HashSize + hashValue];

  p->hash[hash2Value] = p->pos;
  p->hash[kFix3HashSize + hash3Value] = p->pos;
  p->hash[kFix4HashSize + hashValue] = p->pos;

  if (dist2 < p->cyclicBufferSize && *(cur - dist2) == *cur)
  {
    bestLen = 2;
    distances[0] = 2;
    distances[1] = dist2 - 1;
    offset = 2;
  }

  if (dist2 != dist3 && dist3 < p->cyclicBufferSize && *(cur - dist3) == *cur)
  {
    bestLen = 3;
    distances[offset + 1] = dist3 - 1;
    offset += 2;
    dist2 = dist3;   /* the extension below now follows the 3-byte candidate */
  }

  if (offset != 0)
  {
    while (bestLen != lenLimit && cur[(ptrdiff_t)bestLen - dist2] == cur[bestLen])
      bestLen++;

    /* Length slot of the last pair recorded above. */
    distances[offset - 2] = bestLen;

    if (bestLen == lenLimit)
    {
      SkipMatchesSpec(lenLimit, curMatch, MF_PARAMS(p));
      MOVE_POS_RET;
    }
  }

  if (bestLen < 3)
    bestLen = 3;

  GET_MATCHES_FOOTER(offset, bestLen)
}
|
25333
|
|
|
|
|
|
|
|
|
25334
|
0
|
|
|
|
|
|
/* GetMatches for the hash-chain finder with 2-, 3- and 4-byte hashes.
   Same direct 2-byte / 3-byte bucket probing as the Bt4 variant, but the
   remaining search goes through Hc_GetMatchesSpec, and when the extended
   candidate already reaches lenLimit only the chain link
   p->son[p->cyclicBufferPos] is written.
   Returns the final `offset`, or 0 when fewer than 4 bytes are available. */
static uint32_t Hc4_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t hash2Value, hash3Value;   /* presumably assigned by HASH4_CALC (macro defined elsewhere) */
  uint32_t bestLen = 1;
  uint32_t offset = 0;
  GET_MATCHES_HEADER(4)

  HASH4_CALC;

  /* Read all three bucket heads before overwriting them. */
  uint32_t dist2 = p->pos - p->hash[hash2Value];
  const uint32_t dist3 = p->pos - p->hash[kFix3HashSize + hash3Value];
  curMatch = p->hash[kFix4HashSize + hashValue];

  p->hash[hash2Value] = p->pos;
  p->hash[kFix3HashSize + hash3Value] = p->pos;
  p->hash[kFix4HashSize + hashValue] = p->pos;

  if (dist2 < p->cyclicBufferSize && *(cur - dist2) == *cur)
  {
    bestLen = 2;
    distances[0] = 2;
    distances[1] = dist2 - 1;
    offset = 2;
  }

  if (dist2 != dist3 && dist3 < p->cyclicBufferSize && *(cur - dist3) == *cur)
  {
    bestLen = 3;
    distances[offset + 1] = dist3 - 1;
    offset += 2;
    dist2 = dist3;   /* the extension below now follows the 3-byte candidate */
  }

  if (offset != 0)
  {
    while (bestLen != lenLimit && cur[(ptrdiff_t)bestLen - dist2] == cur[bestLen])
      bestLen++;

    /* Length slot of the last pair recorded above. */
    distances[offset - 2] = bestLen;

    if (bestLen == lenLimit)
    {
      p->son[p->cyclicBufferPos] = curMatch;
      MOVE_POS_RET;
    }
  }

  if (bestLen < 3)
    bestLen = 3;

  offset = (uint32_t)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p),
                                        distances + offset, bestLen) - (distances));
  MOVE_POS_RET
}
|
25382
|
|
|
|
|
|
|
|
|
25383
|
0
|
|
|
|
|
|
/* GetMatches for the hash-chain finder using the HASH_ZIP_CALC hash
   (requires at least 3 available bytes).
   The previous bucket head is passed to Hc_GetMatchesSpec together with
   `distances`; the distance of the returned pointer from `distances` is the
   return value.  Returns 0 when fewer than 3 bytes are available. */
uint32_t Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, uint32_t *distances)
{
  uint32_t offset;
  GET_MATCHES_HEADER(3)

  HASH_ZIP_CALC;

  curMatch = p->hash[hashValue];
  p->hash[hashValue] = p->pos;

  offset = (uint32_t)(Hc_GetMatchesSpec(lenLimit, curMatch, MF_PARAMS(p),
                                        distances, 2) - (distances));
  MOVE_POS_RET
}
|
25394
|
|
|
|
|
|
|
|
|
25395
|
0
|
|
|
|
|
|
/* Advances the 2-byte-hash binary-tree finder over `num` positions without
   reporting matches, still updating the hash table and the tree.
   `num` must be at least 1: the loop body runs before the counter is tested. */
static void Bt2_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    /* On too little input SKIP_HEADER advances one byte and `continue`s,
       which in a do/while jumps straight to the `--num` test below. */
    SKIP_HEADER(2)

    HASH2_CALC;

    curMatch = p->hash[hashValue];
    p->hash[hashValue] = p->pos;

    SKIP_FOOTER
  }
  while (--num != 0);
}
|
25407
|
|
|
|
|
|
|
|
|
25408
|
0
|
|
|
|
|
|
/* Advances the HASH_ZIP_CALC binary-tree finder over `num` positions without
   reporting matches, still updating the hash table and the tree.
   `num` must be at least 1: the loop body runs before the counter is tested. */
void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    /* On too little input SKIP_HEADER advances one byte and `continue`s,
       which in a do/while jumps straight to the `--num` test below. */
    SKIP_HEADER(3)

    HASH_ZIP_CALC;

    curMatch = p->hash[hashValue];
    p->hash[hashValue] = p->pos;

    SKIP_FOOTER
  }
  while (--num != 0);
}
|
25420
|
|
|
|
|
|
|
|
|
25421
|
0
|
|
|
|
|
|
/* Advances the 2+3-byte-hash binary-tree finder over `num` positions without
   reporting matches; both hash buckets and the tree are still updated.
   `num` must be at least 1: the loop body runs before the counter is tested. */
static void Bt3_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    uint32_t hash2Value;   /* presumably assigned by HASH3_CALC (macro defined elsewhere) */

    /* On too little input SKIP_HEADER advances one byte and `continue`s,
       which in a do/while jumps straight to the `--num` test below. */
    SKIP_HEADER(3)

    HASH3_CALC;

    curMatch = p->hash[kFix3HashSize + hashValue];

    p->hash[hash2Value] = p->pos;
    p->hash[kFix3HashSize + hashValue] = p->pos;

    SKIP_FOOTER
  }
  while (--num != 0);
}
|
25435
|
|
|
|
|
|
|
|
|
25436
|
0
|
|
|
|
|
|
/* Advances the 2+3+4-byte-hash binary-tree finder over `num` positions
   without reporting matches; all three hash buckets and the tree are still
   updated.
   `num` must be at least 1: the loop body runs before the counter is tested. */
static void Bt4_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    uint32_t hash2Value, hash3Value;   /* presumably assigned by HASH4_CALC (macro defined elsewhere) */

    /* On too little input SKIP_HEADER advances one byte and `continue`s,
       which in a do/while jumps straight to the `--num` test below. */
    SKIP_HEADER(4)

    HASH4_CALC;

    curMatch = p->hash[kFix4HashSize + hashValue];

    p->hash[hash2Value] = p->pos;
    p->hash[kFix3HashSize + hash3Value] = p->pos;
    p->hash[kFix4HashSize + hashValue] = p->pos;

    SKIP_FOOTER
  }
  while (--num != 0);
}
|
25451
|
|
|
|
|
|
|
|
|
25452
|
0
|
|
|
|
|
|
/* Advances the 2+3+4-byte-hash hash-chain finder over `num` positions
   without reporting matches.  For every position the three hash buckets are
   pointed at the current position and the previous 4-byte bucket head is
   stored in p->son[p->cyclicBufferPos].
   `num` must be at least 1: the loop body runs before the counter is tested. */
static void Hc4_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    uint32_t hash2Value, hash3Value;   /* presumably assigned by HASH4_CALC (macro defined elsewhere) */

    /* On too little input SKIP_HEADER advances one byte and `continue`s,
       which in a do/while jumps straight to the `--num` test below. */
    SKIP_HEADER(4)

    HASH4_CALC;

    curMatch = p->hash[kFix4HashSize + hashValue];

    p->hash[hash2Value] = p->pos;
    p->hash[kFix3HashSize + hash3Value] = p->pos;
    p->hash[kFix4HashSize + hashValue] = p->pos;

    p->son[p->cyclicBufferPos] = curMatch;

    MOVE_POS
  }
  while (--num != 0);
}
|
25468
|
|
|
|
|
|
|
|
|
25469
|
0
|
|
|
|
|
|
/* Advances the HASH_ZIP_CALC hash-chain finder over `num` positions without
   reporting matches.  For every position the hash bucket is pointed at the
   current position and the previous bucket head is stored in
   p->son[p->cyclicBufferPos].
   `num` must be at least 1: the loop body runs before the counter is tested. */
void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, uint32_t num)
{
  do
  {
    /* On too little input SKIP_HEADER advances one byte and `continue`s,
       which in a do/while jumps straight to the `--num` test below. */
    SKIP_HEADER(3)

    HASH_ZIP_CALC;

    curMatch = p->hash[hashValue];
    p->hash[hashValue] = p->pos;

    p->son[p->cyclicBufferPos] = curMatch;

    MOVE_POS
  }
  while (--num != 0);
}
|
25482
|
|
|
|
|
|
|
|
|
25483
|
0
|
|
|
|
|
|
/* Fills the IMatchFinder function table for the match finder `p`.
   The four accessor callbacks are always the generic MatchFinder_* functions.
   GetMatches / Skip are selected as follows:
     p->btMode == 0         -> Hc4 (hash chain)
     p->numHashBytes == 2   -> Bt2
     p->numHashBytes == 3   -> Bt3
     anything else          -> Bt4 */
void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder *vTable)
{
  vTable->Init = (Mf_Init_Func)MatchFinder_Init;
  vTable->GetIndexByte = (Mf_GetIndexByte_Func)MatchFinder_GetIndexByte;
  vTable->GetNumAvailableBytes = (Mf_GetNumAvailableBytes_Func)MatchFinder_GetNumAvailableBytes;
  vTable->GetPointerToCurrentPos = (Mf_GetPointerToCurrentPos_Func)MatchFinder_GetPointerToCurrentPos;

  /* Hash-chain mode ignores numHashBytes. */
  if (!p->btMode)
  {
    vTable->GetMatches = (Mf_GetMatches_Func)Hc4_MatchFinder_GetMatches;
    vTable->Skip = (Mf_Skip_Func)Hc4_MatchFinder_Skip;
    return;
  }

  switch (p->numHashBytes)
  {
    case 2:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt2_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt2_MatchFinder_Skip;
      break;
    case 3:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt3_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt3_MatchFinder_Skip;
      break;
    default:
      vTable->GetMatches = (Mf_GetMatches_Func)Bt4_MatchFinder_GetMatches;
      vTable->Skip = (Mf_Skip_Func)Bt4_MatchFinder_Skip;
      break;
  }
}
|
25510
|
|
|
|
|
|
|
|
|
25511
|
|
|
|
|
|
|
// LzmaEnc.h -- LZMA Encoder |
|
25512
|
|
|
|
|
|
|
// 2009-02-07 : Igor Pavlov : Public domain |
|
25513
|
|
|
|
|
|
|
|
|
25514
|
|
|
|
|
|
|
#define LZMA_PROPS_SIZE 5

/* Encoder settings.
   LzmaEncProps_Init stores "unset" markers: -1 in the signed fields other
   than `level`, and 0 in dictSize / mc.  LzmaEncProps_Normalize replaces the
   markers with the defaults listed below (several of which depend on
   `level`). */
struct CLzmaEncProps
{
  int level;       /* 0 <= level <= 9; Init sets 5, negative is normalized to 5 */
  uint32_t dictSize; /* (1 << 12) <= dictSize <= (1 << 27) for 32-bit version
                      (1 << 12) <= dictSize <= (1 << 30) for 64-bit version
                       0 = derive from level; level 5 gives (1 << 24) */
  int lc;          /* 0 <= lc <= 8, default = 3 */
  int lp;          /* 0 <= lp <= 4, default = 0 */
  int pb;          /* 0 <= pb <= 4, default = 2 */
  int algo;        /* 0 - fast, 1 - normal, default = (level < 5 ? 0 : 1) */
  int fb;          /* 5 <= fb <= 273, default = (level < 7 ? 32 : 64) */
  int btMode;      /* 0 - hashChain Mode, 1 - binTree mode - normal, default = (algo == 0 ? 0 : 1) */
  int numHashBytes; /* 2, 3 or 4, default = 4 */
  uint32_t mc;     /* 1 <= mc <= (1 << 30); 0 = derive: 16 + fb / 2, halved in hashChain mode */
  unsigned writeEndMark;  /* 0 - do not write EOPM, 1 - write EOPM, default = 0 */
  int numThreads;  /* 1 or 2; LzmaEncProps_Normalize sets 1 when negative */
};

/* Stores level 5, writeEndMark 0 and the "unset" markers in all other fields. */
void LzmaEncProps_Init(CLzmaEncProps *p);
/* Replaces every "unset" marker in *p with its default value. */
void LzmaEncProps_Normalize(CLzmaEncProps *p);
/* Returns the dictSize that a normalized copy of *props2 would have;
   *props2 itself is not modified. */
uint32_t LzmaEncProps_GetDictSize(const CLzmaEncProps *props2);
|
25537
|
|
|
|
|
|
|
|
|
25538
|
|
|
|
|
|
|
/* ---------- CLzmaEncHandle Interface ---------- */ |
|
25539
|
|
|
|
|
|
|
|
|
25540
|
|
|
|
|
|
|
/* LzmaEnc_* functions can return the following exit codes: |
|
25541
|
|
|
|
|
|
|
Returns: |
|
25542
|
|
|
|
|
|
|
SZ_OK - OK |
|
25543
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
|
25544
|
|
|
|
|
|
|
SZ_ERROR_PARAM - Incorrect parameter in props |
|
25545
|
|
|
|
|
|
|
SZ_ERROR_WRITE - Write callback error. |
|
25546
|
|
|
|
|
|
|
SZ_ERROR_PROGRESS - some break from progress callback |
|
25547
|
|
|
|
|
|
|
SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) |
|
25548
|
|
|
|
|
|
|
*/ |
|
25549
|
|
|
|
|
|
|
|
|
25550
|
|
|
|
|
|
|
typedef void * CLzmaEncHandle; |
|
25551
|
|
|
|
|
|
|
|
|
25552
|
|
|
|
|
|
|
CLzmaEncHandle LzmaEnc_Create(ISzAlloc *alloc); |
|
25553
|
|
|
|
|
|
|
void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAlloc *alloc, ISzAlloc *allocBig); |
|
25554
|
|
|
|
|
|
|
SRes LzmaEnc_SetProps(CLzmaEncHandle p, const CLzmaEncProps *props); |
|
25555
|
|
|
|
|
|
|
SRes LzmaEnc_WriteProperties(CLzmaEncHandle p, uint8_t *properties, size_t *size); |
|
25556
|
|
|
|
|
|
|
SRes LzmaEnc_Encode(CLzmaEncHandle p, ISeqOutStream *outStream, ISeqInStream *inStream, |
|
25557
|
|
|
|
|
|
|
ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); |
|
25558
|
|
|
|
|
|
|
SRes LzmaEnc_MemEncode(CLzmaEncHandle p, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen, |
|
25559
|
|
|
|
|
|
|
int writeEndMark, ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig); |
|
25560
|
|
|
|
|
|
|
|
|
25561
|
|
|
|
|
|
|
/* ---------- One Call Interface ---------- */ |
|
25562
|
|
|
|
|
|
|
|
|
25563
|
|
|
|
|
|
|
/* LzmaEncode |
|
25564
|
|
|
|
|
|
|
Return code: |
|
25565
|
|
|
|
|
|
|
SZ_OK - OK |
|
25566
|
|
|
|
|
|
|
SZ_ERROR_MEM - Memory allocation error |
|
25567
|
|
|
|
|
|
|
SZ_ERROR_PARAM - Incorrect parameter |
|
25568
|
|
|
|
|
|
|
SZ_ERROR_OUTPUT_EOF - output buffer overflow |
|
25569
|
|
|
|
|
|
|
SZ_ERROR_THREAD - errors in multithreading functions (only for Mt version) |
|
25570
|
|
|
|
|
|
|
*/ |
|
25571
|
|
|
|
|
|
|
|
|
25572
|
|
|
|
|
|
|
/* One-call memory-to-memory encoder; see the return-code list in the comment
   preceding this declaration. */
SRes LzmaEncode(uint8_t *dest,
                size_t *destLen,
                const uint8_t *src,
                size_t srcLen,
                const CLzmaEncProps *props,
                uint8_t *propsEncoded,
                size_t *propsSize,
                int writeEndMark,
                ICompressProgress *progress,
                ISzAlloc *alloc,
                ISzAlloc *allocBig);
|
25575
|
|
|
|
|
|
|
|
|
25576
|
|
|
|
|
|
|
// LzmaEnc.c -- LZMA Encoder |
|
25577
|
|
|
|
|
|
|
// 2010-04-16 : Igor Pavlov : Public domain |
|
25578
|
|
|
|
|
|
|
|
|
25579
|
|
|
|
|
|
|
/* Block and buffer sizes. */
#define kBlockSizeMax        ((1 << LZMA_NUM_BLOCK_SIZE_BITS) - 1)

#define kBlockSize           (9 << 10)   /* 9216 */
#define kUnpackBlockSize     (1 << 18)
#define kMatchArraySize      (1 << 21)
#define kMatchRecordMaxSize  ((LZMA_MATCH_LEN_MAX * 2 + 3) * LZMA_MATCH_LEN_MAX)

#define kNumMaxDirectBits    (31)

/* Range-coder scale. */
#define kNumTopBits          24
#define kTopValue            ((uint32_t)1 << kNumTopBits)

/* Bit-probability model. */
#define kNumBitModelTotalBits 11
#define kBitModelTotal       (1 << kNumBitModelTotalBits)   /* 2048 */
#define kNumMoveBits         5
#define kProbInitValue       (kBitModelTotal >> 1)          /* 1024 */

/* Price-table resolution. */
#define kNumMoveReducingBits  4
#define kNumBitPriceShiftBits 4
#define kBitPrice            (1 << kNumBitPriceShiftBits)   /* 16 */
|
25599
|
|
|
|
|
|
|
|
|
25600
|
0
|
|
|
|
|
|
/* Resets *p to "use defaults": level 5, no end mark, and the unset markers
   (0 for dictSize / mc, -1 for every other field) that
   LzmaEncProps_Normalize later replaces with concrete values. */
void LzmaEncProps_Init(CLzmaEncProps *p)
{
  p->level = 5;

  /* 0 means "derive during normalization". */
  p->dictSize = 0;
  p->mc = 0;

  /* -1 means "derive during normalization". */
  p->lc = -1;
  p->lp = -1;
  p->pb = -1;
  p->algo = -1;
  p->fb = -1;
  p->btMode = -1;
  p->numHashBytes = -1;
  p->numThreads = -1;

  p->writeEndMark = 0;
}
|
25607
|
|
|
|
|
|
|
|
|
25608
|
0
|
|
|
|
|
|
/* Replaces every unset field of *p (negative, or 0 for dictSize / mc) with
   its default.  Order matters: algo and fb depend on level, btMode depends
   on algo, and mc depends on fb and btMode. */
void LzmaEncProps_Normalize(CLzmaEncProps *p)
{
  const int level = (p->level < 0) ? 5 : p->level;
  p->level = level;

  if (p->dictSize == 0)
  {
    /* 16 KiB at level 0, multiplied by 4 per level up to level 5;
       32 MiB at level 6 and 64 MiB above. */
    if (level <= 5)
      p->dictSize = (1 << (level * 2 + 14));
    else if (level == 6)
      p->dictSize = (1 << 25);
    else
      p->dictSize = (1 << 26);
  }

  if (p->lc < 0)
    p->lc = 3;
  if (p->lp < 0)
    p->lp = 0;
  if (p->pb < 0)
    p->pb = 2;

  if (p->algo < 0)
    p->algo = (level >= 5) ? 1 : 0;
  if (p->fb < 0)
    p->fb = (level >= 7) ? 64 : 32;
  if (p->btMode < 0)
    p->btMode = (p->algo != 0) ? 1 : 0;
  if (p->numHashBytes < 0)
    p->numHashBytes = 4;

  if (p->mc == 0)
  {
    /* Halved in hash-chain mode. */
    const int cycles = 16 + (p->fb >> 1);
    p->mc = p->btMode ? cycles : (cycles >> 1);
  }

  if (p->numThreads < 0)
    p->numThreads = 1;
}
|
25625
|
|
|
|
|
|
|
|
|
25626
|
0
|
|
|
|
|
|
uint32_t LzmaEncProps_GetDictSize(const CLzmaEncProps *props2) |
|
25627
|
|
|
|
|
|
|
{ |
|
25628
|
0
|
|
|
|
|
|
CLzmaEncProps props = *props2; |
|
25629
|
0
|
|
|
|
|
|
LzmaEncProps_Normalize(&props); |
|
25630
|
0
|
|
|
|
|
|
return props.dictSize; |
|
25631
|
|
|
|
|
|
|
} |
|
25632
|
|
|
|
|
|
|
|
|
25633
|
|
|
|
|
|
|
/* #define LZMA_LOG_BSR */ |
|
25634
|
|
|
|
|
|
|
/* Define it for Intel's CPU */ |
|
25635
|
|
|
|
|
|
|
|
|
25636
|
|
|
|
|
|
|
#ifdef LZMA_LOG_BSR |
|
25637
|
|
|
|
|
|
|
|
|
25638
|
|
|
|
|
|
|
#define kDicLogSizeMaxCompress 30 |
|
25639
|
|
|
|
|
|
|
|
|
25640
|
|
|
|
|
|
|
#define BSR2_RET(pos, res) { unsigned long i; _BitScanReverse(&i, (pos)); res = (i + i) + ((pos >> (i - 1)) & 1); } |
|
25641
|
|
|
|
|
|
|
|
|
25642
|
|
|
|
|
|
|
uint32_t GetPosSlot1(uint32_t pos) |
|
25643
|
|
|
|
|
|
|
{ |
|
25644
|
|
|
|
|
|
|
uint32_t res; |
|
25645
|
|
|
|
|
|
|
BSR2_RET(pos, res); |
|
25646
|
|
|
|
|
|
|
return res; |
|
25647
|
|
|
|
|
|
|
} |
|
25648
|
|
|
|
|
|
|
#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); } |
|
25649
|
|
|
|
|
|
|
#define GetPosSlot(pos, res) { if (pos < 2) res = pos; else BSR2_RET(pos, res); } |
|
25650
|
|
|
|
|
|
|
|
|
25651
|
|
|
|
|
|
|
#else |
|
25652
|
|
|
|
|
|
|
|
|
25653
|
|
|
|
|
|
|
//#define kNumLogBits (9 + (int)sizeof(size_t) / 2)
#define kNumLogBits (9 + (int)sizeof(uint32_t) / 2)
#define kDicLogSizeMaxCompress ((kNumLogBits - 1) * 2 + 7)

/* Fills the slot lookup table g_FastPos, which must have room for
   (1 << kNumLogBits) bytes.  Entries 0 and 1 hold 0 and 1; after that each
   slot value s in [2, 2 * kNumLogBits) occupies a run of
   1 << ((s >> 1) - 1) consecutive entries. */
void LzmaEnc_FastPosInit(uint8_t *g_FastPos)
{
  uint8_t *out = g_FastPos;
  *out++ = 0;
  *out++ = 1;

  for (int slot = 2; slot < kNumLogBits * 2; ++slot)
  {
    const uint32_t runLength = (uint32_t)1 << ((slot >> 1) - 1);
    for (uint32_t remaining = runLength; remaining != 0; --remaining)
      *out++ = (uint8_t)slot;
  }
}
|
25671
|
|
|
|
|
|
|
|
|
25672
|
|
|
|
|
|
|
/* Table-based slot lookup.  Expects an encoder pointer `p` with a g_FastPos
   table in scope.
   i is 6 when pos < (1 << (kNumLogBits + 6)) and 6 + (kNumLogBits - 1)
   otherwise: the unsigned subtraction wraps for large pos, so its bit 31
   becomes 1 and `0 - 1` turns into an all-ones mask.  `pos` must therefore be
   a 32-bit unsigned value.  The slot is then g_FastPos[pos >> i] + 2 * i.
   Macro arguments are parenthesized so expression arguments such as `a + b`
   are shifted and subtracted as a whole; `pos` is evaluated more than once,
   so it must be free of side effects. */
#define BSR2_RET(pos, res) { uint32_t i = 6 + ((kNumLogBits - 1) & \
  (0 - (((((uint32_t)1 << (kNumLogBits + 6)) - 1) - (pos)) >> 31))); \
  (res) = p->g_FastPos[(pos) >> i] + (i * 2); }
/*
#define BSR2_RET(pos, res) { res = (pos < (1 << (kNumLogBits + 6))) ? \
  p->g_FastPos[pos >> 6] + 12 : \
  p->g_FastPos[pos >> (6 + kNumLogBits - 1)] + (6 + (kNumLogBits - 1)) * 2; }
*/

/* Direct table lookup; the index is used unchecked. */
#define GetPosSlot1(pos) (p->g_FastPos[(pos)])
#define GetPosSlot2(pos, res) { BSR2_RET(pos, res); }
/* Direct lookup below kNumFullDistances, BSR2_RET otherwise. */
#define GetPosSlot(pos, res) { if ((pos) < kNumFullDistances) (res) = p->g_FastPos[(pos)]; else BSR2_RET(pos, res); }
|
25684
|
|
|
|
|
|
|
|
|
25685
|
|
|
|
|
|
|
#endif |
|
25686
|
|
|
|
|
|
|
|
|
25687
|
|
|
|
|
|
|
#define LZMA_NUM_REPS 4

using CState = unsigned;

/* One entry of the encoder's opt[] array (CLzmaEnc::opt). */
struct COptimal
{
  uint32_t price;

  CState state;
  int prev1IsChar;
  int prev2;

  uint32_t posPrev2;
  uint32_t backPrev2;

  uint32_t posPrev;
  uint32_t backPrev;
  uint32_t backs[LZMA_NUM_REPS];   /* one value per rep slot */
};
|
25706
|
|
|
|
|
|
|
|
|
25707
|
|
|
|
|
|
|
/* Number of entries in the encoder's opt[] array. */
#define kNumOpts                (1 << 12)

#define kNumLenToPosStates      4
#define kNumPosSlotBits         6
#define kDicLogSizeMin          0
#define kDicLogSizeMax          32
#define kDistTableSizeMax       (kDicLogSizeMax * 2)                 /* 64 */

#define kNumAlignBits           4
#define kAlignTableSize         (1 << kNumAlignBits)                 /* 16 */
#define kAlignMask              (kAlignTableSize - 1)                /* 15 */

#define kStartPosModelIndex     4
#define kEndPosModelIndex       14
#define kNumPosModels           (kEndPosModelIndex - kStartPosModelIndex)   /* 10 */

#define kNumFullDistances       (1 << (kEndPosModelIndex >> 1))      /* 128 */

/* Storage type of one probability: 32-bit when _LZMA_PROB32 is defined,
   16-bit otherwise. */
#ifdef _LZMA_PROB32
#define CLzmaProb uint32_t
#else
#define CLzmaProb uint16_t
#endif

/* Upper bounds of the pb / lc / lp parameters. */
#define LZMA_PB_MAX             4
#define LZMA_LC_MAX             8
#define LZMA_LP_MAX             4

#define LZMA_NUM_PB_STATES_MAX  (1 << LZMA_PB_MAX)                   /* 16 */

/* Length coder: low / mid / high symbol groups. */
#define kLenNumLowBits          3
#define kLenNumLowSymbols       (1 << kLenNumLowBits)                /* 8 */
#define kLenNumMidBits          3
#define kLenNumMidSymbols       (1 << kLenNumMidBits)                /* 8 */
#define kLenNumHighBits         8
#define kLenNumHighSymbols      (1 << kLenNumHighBits)               /* 256 */

#define kLenNumSymbolsTotal     (kLenNumLowSymbols + kLenNumMidSymbols + kLenNumHighSymbols)   /* 272 */

#define LZMA_MATCH_LEN_MIN      2
#define LZMA_MATCH_LEN_MAX      (LZMA_MATCH_LEN_MIN + kLenNumSymbolsTotal - 1)   /* 273 */

#define kNumStates              12
|
25750
|
|
|
|
|
|
|
|
|
25751
|
|
|
|
|
|
|
struct CLenEnc |
|
25752
|
|
|
|
|
|
|
{ |
|
25753
|
|
|
|
|
|
|
CLzmaProb choice; |
|
25754
|
|
|
|
|
|
|
CLzmaProb choice2; |
|
25755
|
|
|
|
|
|
|
CLzmaProb low[LZMA_NUM_PB_STATES_MAX << kLenNumLowBits]; |
|
25756
|
|
|
|
|
|
|
CLzmaProb mid[LZMA_NUM_PB_STATES_MAX << kLenNumMidBits]; |
|
25757
|
|
|
|
|
|
|
CLzmaProb high[kLenNumHighSymbols]; |
|
25758
|
|
|
|
|
|
|
}; |
|
25759
|
|
|
|
|
|
|
|
|
25760
|
|
|
|
|
|
|
struct CLenPriceEnc |
|
25761
|
|
|
|
|
|
|
{ |
|
25762
|
|
|
|
|
|
|
CLenEnc p; |
|
25763
|
|
|
|
|
|
|
uint32_t prices[LZMA_NUM_PB_STATES_MAX][kLenNumSymbolsTotal]; |
|
25764
|
|
|
|
|
|
|
uint32_t tableSize; |
|
25765
|
|
|
|
|
|
|
uint32_t counters[LZMA_NUM_PB_STATES_MAX]; |
|
25766
|
|
|
|
|
|
|
}; |
|
25767
|
|
|
|
|
|
|
|
|
25768
|
|
|
|
|
|
|
struct CRangeEnc |
|
25769
|
|
|
|
|
|
|
{ |
|
25770
|
|
|
|
|
|
|
uint32_t range; |
|
25771
|
|
|
|
|
|
|
uint8_t cache; |
|
25772
|
|
|
|
|
|
|
uint64_t low; |
|
25773
|
|
|
|
|
|
|
uint64_t cacheSize; |
|
25774
|
|
|
|
|
|
|
uint8_t *buf; |
|
25775
|
|
|
|
|
|
|
uint8_t *bufLim; |
|
25776
|
|
|
|
|
|
|
uint8_t *bufBase; |
|
25777
|
|
|
|
|
|
|
ISeqOutStream *outStream; |
|
25778
|
|
|
|
|
|
|
uint64_t processed; |
|
25779
|
|
|
|
|
|
|
SRes res; |
|
25780
|
|
|
|
|
|
|
}; |
|
25781
|
|
|
|
|
|
|
|
|
25782
|
|
|
|
|
|
|
struct CSaveState |
|
25783
|
|
|
|
|
|
|
{ |
|
25784
|
|
|
|
|
|
|
CLzmaProb *litProbs; |
|
25785
|
|
|
|
|
|
|
|
|
25786
|
|
|
|
|
|
|
CLzmaProb isMatch[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
|
25787
|
|
|
|
|
|
|
CLzmaProb isRep[kNumStates]; |
|
25788
|
|
|
|
|
|
|
CLzmaProb isRepG0[kNumStates]; |
|
25789
|
|
|
|
|
|
|
CLzmaProb isRepG1[kNumStates]; |
|
25790
|
|
|
|
|
|
|
CLzmaProb isRepG2[kNumStates]; |
|
25791
|
|
|
|
|
|
|
CLzmaProb isRep0Long[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
|
25792
|
|
|
|
|
|
|
|
|
25793
|
|
|
|
|
|
|
CLzmaProb posSlotEncoder[kNumLenToPosStates][1 << kNumPosSlotBits]; |
|
25794
|
|
|
|
|
|
|
CLzmaProb posEncoders[kNumFullDistances - kEndPosModelIndex]; |
|
25795
|
|
|
|
|
|
|
CLzmaProb posAlignEncoder[1 << kNumAlignBits]; |
|
25796
|
|
|
|
|
|
|
|
|
25797
|
|
|
|
|
|
|
CLenPriceEnc lenEnc; |
|
25798
|
|
|
|
|
|
|
CLenPriceEnc repLenEnc; |
|
25799
|
|
|
|
|
|
|
|
|
25800
|
|
|
|
|
|
|
uint32_t reps[LZMA_NUM_REPS]; |
|
25801
|
|
|
|
|
|
|
uint32_t state; |
|
25802
|
|
|
|
|
|
|
}; |
|
25803
|
|
|
|
|
|
|
|
|
25804
|
|
|
|
|
|
|
struct CLzmaEnc |
|
25805
|
|
|
|
|
|
|
{ |
|
25806
|
|
|
|
|
|
|
IMatchFinder matchFinder; |
|
25807
|
|
|
|
|
|
|
CMatchFinder *matchFinderObj; |
|
25808
|
|
|
|
|
|
|
|
|
25809
|
|
|
|
|
|
|
CMatchFinder matchFinderBase; |
|
25810
|
|
|
|
|
|
|
|
|
25811
|
|
|
|
|
|
|
uint32_t optimumEndIndex; |
|
25812
|
|
|
|
|
|
|
uint32_t optimumCurrentIndex; |
|
25813
|
|
|
|
|
|
|
|
|
25814
|
|
|
|
|
|
|
uint32_t longestMatchLength; |
|
25815
|
|
|
|
|
|
|
uint32_t numPairs; |
|
25816
|
|
|
|
|
|
|
uint32_t numAvail; |
|
25817
|
|
|
|
|
|
|
COptimal opt[kNumOpts]; |
|
25818
|
|
|
|
|
|
|
|
|
25819
|
|
|
|
|
|
|
#ifndef LZMA_LOG_BSR |
|
25820
|
|
|
|
|
|
|
uint8_t g_FastPos[1 << kNumLogBits]; |
|
25821
|
|
|
|
|
|
|
#endif |
|
25822
|
|
|
|
|
|
|
|
|
25823
|
|
|
|
|
|
|
uint32_t ProbPrices[kBitModelTotal >> kNumMoveReducingBits]; |
|
25824
|
|
|
|
|
|
|
uint32_t matches[LZMA_MATCH_LEN_MAX * 2 + 2 + 1]; |
|
25825
|
|
|
|
|
|
|
uint32_t numFastBytes; |
|
25826
|
|
|
|
|
|
|
uint32_t additionalOffset; |
|
25827
|
|
|
|
|
|
|
uint32_t reps[LZMA_NUM_REPS]; |
|
25828
|
|
|
|
|
|
|
uint32_t state; |
|
25829
|
|
|
|
|
|
|
|
|
25830
|
|
|
|
|
|
|
uint32_t posSlotPrices[kNumLenToPosStates][kDistTableSizeMax]; |
|
25831
|
|
|
|
|
|
|
uint32_t distancesPrices[kNumLenToPosStates][kNumFullDistances]; |
|
25832
|
|
|
|
|
|
|
uint32_t alignPrices[kAlignTableSize]; |
|
25833
|
|
|
|
|
|
|
uint32_t alignPriceCount; |
|
25834
|
|
|
|
|
|
|
|
|
25835
|
|
|
|
|
|
|
uint32_t distTableSize; |
|
25836
|
|
|
|
|
|
|
|
|
25837
|
|
|
|
|
|
|
unsigned lc, lp, pb; |
|
25838
|
|
|
|
|
|
|
unsigned lpMask, pbMask; |
|
25839
|
|
|
|
|
|
|
|
|
25840
|
|
|
|
|
|
|
CLzmaProb *litProbs; |
|
25841
|
|
|
|
|
|
|
|
|
25842
|
|
|
|
|
|
|
CLzmaProb isMatch[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
|
25843
|
|
|
|
|
|
|
CLzmaProb isRep[kNumStates]; |
|
25844
|
|
|
|
|
|
|
CLzmaProb isRepG0[kNumStates]; |
|
25845
|
|
|
|
|
|
|
CLzmaProb isRepG1[kNumStates]; |
|
25846
|
|
|
|
|
|
|
CLzmaProb isRepG2[kNumStates]; |
|
25847
|
|
|
|
|
|
|
CLzmaProb isRep0Long[kNumStates][LZMA_NUM_PB_STATES_MAX]; |
|
25848
|
|
|
|
|
|
|
|
|
25849
|
|
|
|
|
|
|
CLzmaProb posSlotEncoder[kNumLenToPosStates][1 << kNumPosSlotBits]; |
|
25850
|
|
|
|
|
|
|
CLzmaProb posEncoders[kNumFullDistances - kEndPosModelIndex]; |
|
25851
|
|
|
|
|
|
|
CLzmaProb posAlignEncoder[1 << kNumAlignBits]; |
|
25852
|
|
|
|
|
|
|
|
|
25853
|
|
|
|
|
|
|
CLenPriceEnc lenEnc; |
|
25854
|
|
|
|
|
|
|
CLenPriceEnc repLenEnc; |
|
25855
|
|
|
|
|
|
|
|
|
25856
|
|
|
|
|
|
|
unsigned lclp; |
|
25857
|
|
|
|
|
|
|
|
|
25858
|
|
|
|
|
|
|
bool fastMode; |
|
25859
|
|
|
|
|
|
|
|
|
25860
|
|
|
|
|
|
|
CRangeEnc rc; |
|
25861
|
|
|
|
|
|
|
|
|
25862
|
|
|
|
|
|
|
bool writeEndMark; |
|
25863
|
|
|
|
|
|
|
uint64_t nowPos64; |
|
25864
|
|
|
|
|
|
|
uint32_t matchPriceCount; |
|
25865
|
|
|
|
|
|
|
bool finished; |
|
25866
|
|
|
|
|
|
|
bool multiThread; |
|
25867
|
|
|
|
|
|
|
|
|
25868
|
|
|
|
|
|
|
SRes result; |
|
25869
|
|
|
|
|
|
|
uint32_t dictSize; |
|
25870
|
|
|
|
|
|
|
uint32_t matchFinderCycles; |
|
25871
|
|
|
|
|
|
|
|
|
25872
|
|
|
|
|
|
|
int needInit; |
|
25873
|
|
|
|
|
|
|
|
|
25874
|
|
|
|
|
|
|
CSaveState saveState; |
|
25875
|
|
|
|
|
|
|
}; |
|
25876
|
|
|
|
|
|
|
|
|
25877
|
0
|
|
|
|
|
|
void LzmaEnc_SaveState(CLzmaEncHandle pp) |
|
25878
|
|
|
|
|
|
|
{ |
|
25879
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
25880
|
|
|
|
|
|
|
CSaveState *dest = &p->saveState; |
|
25881
|
|
|
|
|
|
|
int i; |
|
25882
|
0
|
|
|
|
|
|
dest->lenEnc = p->lenEnc; |
|
25883
|
0
|
|
|
|
|
|
dest->repLenEnc = p->repLenEnc; |
|
25884
|
0
|
|
|
|
|
|
dest->state = p->state; |
|
25885
|
|
|
|
|
|
|
|
|
25886
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumStates; i++) |
|
25887
|
|
|
|
|
|
|
{ |
|
25888
|
0
|
|
|
|
|
|
memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i])); |
|
25889
|
0
|
|
|
|
|
|
memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i])); |
|
25890
|
|
|
|
|
|
|
} |
|
25891
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumLenToPosStates; i++) |
|
25892
|
0
|
|
|
|
|
|
memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i])); |
|
25893
|
0
|
|
|
|
|
|
memcpy(dest->isRep, p->isRep, sizeof(p->isRep)); |
|
25894
|
0
|
|
|
|
|
|
memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0)); |
|
25895
|
0
|
|
|
|
|
|
memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1)); |
|
25896
|
0
|
|
|
|
|
|
memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2)); |
|
25897
|
0
|
|
|
|
|
|
memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); |
|
25898
|
0
|
|
|
|
|
|
memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); |
|
25899
|
0
|
|
|
|
|
|
memcpy(dest->reps, p->reps, sizeof(p->reps)); |
|
25900
|
0
|
|
|
|
|
|
memcpy(dest->litProbs, p->litProbs, (0x300 << p->lclp) * sizeof(CLzmaProb)); |
|
25901
|
0
|
|
|
|
|
|
} |
|
25902
|
|
|
|
|
|
|
|
|
25903
|
0
|
|
|
|
|
|
void LzmaEnc_RestoreState(CLzmaEncHandle pp) |
|
25904
|
|
|
|
|
|
|
{ |
|
25905
|
|
|
|
|
|
|
CLzmaEnc *dest = (CLzmaEnc *)pp; |
|
25906
|
|
|
|
|
|
|
const CSaveState *p = &dest->saveState; |
|
25907
|
|
|
|
|
|
|
int i; |
|
25908
|
0
|
|
|
|
|
|
dest->lenEnc = p->lenEnc; |
|
25909
|
0
|
|
|
|
|
|
dest->repLenEnc = p->repLenEnc; |
|
25910
|
0
|
|
|
|
|
|
dest->state = p->state; |
|
25911
|
|
|
|
|
|
|
|
|
25912
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumStates; i++) |
|
25913
|
|
|
|
|
|
|
{ |
|
25914
|
0
|
|
|
|
|
|
memcpy(dest->isMatch[i], p->isMatch[i], sizeof(p->isMatch[i])); |
|
25915
|
0
|
|
|
|
|
|
memcpy(dest->isRep0Long[i], p->isRep0Long[i], sizeof(p->isRep0Long[i])); |
|
25916
|
|
|
|
|
|
|
} |
|
25917
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumLenToPosStates; i++) |
|
25918
|
0
|
|
|
|
|
|
memcpy(dest->posSlotEncoder[i], p->posSlotEncoder[i], sizeof(p->posSlotEncoder[i])); |
|
25919
|
0
|
|
|
|
|
|
memcpy(dest->isRep, p->isRep, sizeof(p->isRep)); |
|
25920
|
0
|
|
|
|
|
|
memcpy(dest->isRepG0, p->isRepG0, sizeof(p->isRepG0)); |
|
25921
|
0
|
|
|
|
|
|
memcpy(dest->isRepG1, p->isRepG1, sizeof(p->isRepG1)); |
|
25922
|
0
|
|
|
|
|
|
memcpy(dest->isRepG2, p->isRepG2, sizeof(p->isRepG2)); |
|
25923
|
0
|
|
|
|
|
|
memcpy(dest->posEncoders, p->posEncoders, sizeof(p->posEncoders)); |
|
25924
|
0
|
|
|
|
|
|
memcpy(dest->posAlignEncoder, p->posAlignEncoder, sizeof(p->posAlignEncoder)); |
|
25925
|
0
|
|
|
|
|
|
memcpy(dest->reps, p->reps, sizeof(p->reps)); |
|
25926
|
0
|
|
|
|
|
|
memcpy(dest->litProbs, p->litProbs, (0x300 << dest->lclp) * sizeof(CLzmaProb)); |
|
25927
|
0
|
|
|
|
|
|
} |
|
25928
|
|
|
|
|
|
|
|
|
25929
|
0
|
|
|
|
|
|
SRes LzmaEnc_SetProps(CLzmaEncHandle pp, const CLzmaEncProps *props2) |
|
25930
|
|
|
|
|
|
|
{ |
|
25931
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
25932
|
0
|
|
|
|
|
|
CLzmaEncProps props = *props2; |
|
25933
|
0
|
|
|
|
|
|
LzmaEncProps_Normalize(&props); |
|
25934
|
|
|
|
|
|
|
|
|
25935
|
0
|
0
|
|
|
|
|
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
25936
|
0
|
0
|
|
|
|
|
props.dictSize > ((uint32_t)1 << kDicLogSizeMaxCompress) || props.dictSize > ((uint32_t)1 << 30)) |
|
25937
|
|
|
|
|
|
|
return SZ_ERROR_PARAM; |
|
25938
|
0
|
|
|
|
|
|
p->dictSize = props.dictSize; |
|
25939
|
0
|
|
|
|
|
|
p->matchFinderCycles = props.mc; |
|
25940
|
|
|
|
|
|
|
{ |
|
25941
|
0
|
|
|
|
|
|
unsigned fb = props.fb; |
|
25942
|
0
|
0
|
|
|
|
|
if (fb < 5) |
|
25943
|
|
|
|
|
|
|
fb = 5; |
|
25944
|
0
|
0
|
|
|
|
|
if (fb > LZMA_MATCH_LEN_MAX) |
|
25945
|
|
|
|
|
|
|
fb = LZMA_MATCH_LEN_MAX; |
|
25946
|
0
|
|
|
|
|
|
p->numFastBytes = fb; |
|
25947
|
|
|
|
|
|
|
} |
|
25948
|
0
|
|
|
|
|
|
p->lc = props.lc; |
|
25949
|
0
|
|
|
|
|
|
p->lp = props.lp; |
|
25950
|
0
|
|
|
|
|
|
p->pb = props.pb; |
|
25951
|
0
|
|
|
|
|
|
p->fastMode = (props.algo == 0); |
|
25952
|
0
|
|
|
|
|
|
p->matchFinderBase.btMode = props.btMode; |
|
25953
|
|
|
|
|
|
|
{ |
|
25954
|
|
|
|
|
|
|
uint32_t numHashBytes = 4; |
|
25955
|
0
|
0
|
|
|
|
|
if (props.btMode) |
|
25956
|
|
|
|
|
|
|
{ |
|
25957
|
0
|
0
|
|
|
|
|
if (props.numHashBytes < 2) |
|
25958
|
|
|
|
|
|
|
numHashBytes = 2; |
|
25959
|
0
|
0
|
|
|
|
|
else if (props.numHashBytes < 4) |
|
25960
|
0
|
|
|
|
|
|
numHashBytes = props.numHashBytes; |
|
25961
|
|
|
|
|
|
|
} |
|
25962
|
0
|
|
|
|
|
|
p->matchFinderBase.numHashBytes = numHashBytes; |
|
25963
|
|
|
|
|
|
|
} |
|
25964
|
|
|
|
|
|
|
|
|
25965
|
0
|
|
|
|
|
|
p->matchFinderBase.cutValue = props.mc; |
|
25966
|
|
|
|
|
|
|
|
|
25967
|
0
|
|
|
|
|
|
p->writeEndMark = props.writeEndMark; |
|
25968
|
|
|
|
|
|
|
|
|
25969
|
0
|
|
|
|
|
|
return SZ_OK; |
|
25970
|
|
|
|
|
|
|
} |
|
25971
|
|
|
|
|
|
|
|
|
25972
|
|
|
|
|
|
|
static const int kLiteralNextStates[kNumStates] = {0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 4, 5}; |
|
25973
|
|
|
|
|
|
|
static const int kMatchNextStates[kNumStates] = {7, 7, 7, 7, 7, 7, 7, 10, 10, 10, 10, 10}; |
|
25974
|
|
|
|
|
|
|
static const int kRepNextStates[kNumStates] = {8, 8, 8, 8, 8, 8, 8, 11, 11, 11, 11, 11}; |
|
25975
|
|
|
|
|
|
|
static const int kShortRepNextStates[kNumStates]= {9, 9, 9, 9, 9, 9, 9, 11, 11, 11, 11, 11}; |
|
25976
|
|
|
|
|
|
|
|
|
25977
|
|
|
|
|
|
|
#define IsCharState(s) ((s) < 7) |
|
25978
|
|
|
|
|
|
|
|
|
25979
|
|
|
|
|
|
|
#define GetLenToPosState(len) (((len) < kNumLenToPosStates + 1) ? (len) - 2 : kNumLenToPosStates - 1) |
|
25980
|
|
|
|
|
|
|
|
|
25981
|
|
|
|
|
|
|
#define kInfinityPrice (1 << 30) |
|
25982
|
|
|
|
|
|
|
|
|
25983
|
|
|
|
|
|
|
static void RangeEnc_Construct(CRangeEnc *p) |
|
25984
|
|
|
|
|
|
|
{ |
|
25985
|
0
|
|
|
|
|
|
p->outStream = 0; |
|
25986
|
0
|
|
|
|
|
|
p->bufBase = 0; |
|
25987
|
|
|
|
|
|
|
} |
|
25988
|
|
|
|
|
|
|
|
|
25989
|
|
|
|
|
|
|
#define RangeEnc_GetProcessed(p) ((p)->processed + ((p)->buf - (p)->bufBase) + (p)->cacheSize) |
|
25990
|
|
|
|
|
|
|
|
|
25991
|
|
|
|
|
|
|
#define RC_BUF_SIZE (1 << 16) |
|
25992
|
|
|
|
|
|
|
static int RangeEnc_Alloc(CRangeEnc *p, ISzAlloc *alloc) |
|
25993
|
|
|
|
|
|
|
{ |
|
25994
|
0
|
0
|
|
|
|
|
if (p->bufBase == 0) |
|
25995
|
|
|
|
|
|
|
{ |
|
25996
|
0
|
|
|
|
|
|
p->bufBase = (uint8_t *)alloc->Alloc(alloc, RC_BUF_SIZE); |
|
25997
|
0
|
0
|
|
|
|
|
if (p->bufBase == 0) |
|
25998
|
|
|
|
|
|
|
return 0; |
|
25999
|
0
|
|
|
|
|
|
p->bufLim = p->bufBase + RC_BUF_SIZE; |
|
26000
|
|
|
|
|
|
|
} |
|
26001
|
|
|
|
|
|
|
return 1; |
|
26002
|
|
|
|
|
|
|
} |
|
26003
|
|
|
|
|
|
|
|
|
26004
|
|
|
|
|
|
|
static void RangeEnc_Free(CRangeEnc *p, ISzAlloc *alloc) |
|
26005
|
|
|
|
|
|
|
{ |
|
26006
|
0
|
|
|
|
|
|
alloc->Free(alloc, p->bufBase); |
|
26007
|
0
|
|
|
|
|
|
p->bufBase = 0; |
|
26008
|
|
|
|
|
|
|
} |
|
26009
|
|
|
|
|
|
|
|
|
26010
|
|
|
|
|
|
|
static void RangeEnc_Init(CRangeEnc *p) |
|
26011
|
|
|
|
|
|
|
{ |
|
26012
|
|
|
|
|
|
|
/* Stream.Init(); */ |
|
26013
|
0
|
|
|
|
|
|
p->low = 0; |
|
26014
|
0
|
|
|
|
|
|
p->range = 0xFFFFFFFF; |
|
26015
|
0
|
|
|
|
|
|
p->cacheSize = 1; |
|
26016
|
0
|
|
|
|
|
|
p->cache = 0; |
|
26017
|
|
|
|
|
|
|
|
|
26018
|
0
|
|
|
|
|
|
p->buf = p->bufBase; |
|
26019
|
|
|
|
|
|
|
|
|
26020
|
0
|
|
|
|
|
|
p->processed = 0; |
|
26021
|
0
|
|
|
|
|
|
p->res = SZ_OK; |
|
26022
|
|
|
|
|
|
|
} |
|
26023
|
|
|
|
|
|
|
|
|
26024
|
0
|
|
|
|
|
|
static void RangeEnc_FlushStream(CRangeEnc *p) |
|
26025
|
|
|
|
|
|
|
{ |
|
26026
|
|
|
|
|
|
|
size_t num; |
|
26027
|
0
|
0
|
|
|
|
|
if (p->res != SZ_OK) |
|
26028
|
|
|
|
|
|
|
return; |
|
26029
|
0
|
|
|
|
|
|
num = p->buf - p->bufBase; |
|
26030
|
0
|
0
|
|
|
|
|
if (num != p->outStream->Write(p->outStream, p->bufBase, num)) |
|
26031
|
0
|
|
|
|
|
|
p->res = SZ_ERROR_WRITE; |
|
26032
|
0
|
|
|
|
|
|
p->processed += num; |
|
26033
|
0
|
|
|
|
|
|
p->buf = p->bufBase; |
|
26034
|
|
|
|
|
|
|
} |
|
26035
|
|
|
|
|
|
|
|
|
26036
|
0
|
|
|
|
|
|
static void RangeEnc_ShiftLow(CRangeEnc *p) |
|
26037
|
|
|
|
|
|
|
{ |
|
26038
|
0
|
0
|
|
|
|
|
if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0) |
|
|
|
0
|
|
|
|
|
|
|
26039
|
|
|
|
|
|
|
{ |
|
26040
|
0
|
|
|
|
|
|
uint8_t temp = p->cache; |
|
26041
|
0
|
0
|
|
|
|
|
do |
|
26042
|
|
|
|
|
|
|
{ |
|
26043
|
0
|
|
|
|
|
|
uint8_t *buf = p->buf; |
|
26044
|
0
|
|
|
|
|
|
*buf++ = (uint8_t)(temp + (uint8_t)(p->low >> 32)); |
|
26045
|
0
|
|
|
|
|
|
p->buf = buf; |
|
26046
|
0
|
0
|
|
|
|
|
if (buf == p->bufLim) |
|
26047
|
0
|
|
|
|
|
|
RangeEnc_FlushStream(p); |
|
26048
|
|
|
|
|
|
|
temp = 0xFF; |
|
26049
|
|
|
|
|
|
|
} |
|
26050
|
0
|
|
|
|
|
|
while (--p->cacheSize != 0); |
|
26051
|
0
|
|
|
|
|
|
p->cache = (uint8_t)((uint32_t)p->low >> 24); |
|
26052
|
|
|
|
|
|
|
} |
|
26053
|
0
|
|
|
|
|
|
p->cacheSize++; |
|
26054
|
0
|
|
|
|
|
|
p->low = (uint32_t)p->low << 8; |
|
26055
|
0
|
|
|
|
|
|
} |
|
26056
|
|
|
|
|
|
|
|
|
26057
|
|
|
|
|
|
|
static void RangeEnc_FlushData(CRangeEnc *p) |
|
26058
|
|
|
|
|
|
|
{ |
|
26059
|
|
|
|
|
|
|
int i; |
|
26060
|
0
|
0
|
|
|
|
|
for (i = 0; i < 5; i++) |
|
26061
|
0
|
|
|
|
|
|
RangeEnc_ShiftLow(p); |
|
26062
|
|
|
|
|
|
|
} |
|
26063
|
|
|
|
|
|
|
|
|
26064
|
0
|
|
|
|
|
|
static void RangeEnc_EncodeDirectBits(CRangeEnc *p, uint32_t value, int numBits) |
|
26065
|
|
|
|
|
|
|
{ |
|
26066
|
|
|
|
|
|
|
do |
|
26067
|
|
|
|
|
|
|
{ |
|
26068
|
0
|
|
|
|
|
|
p->range >>= 1; |
|
26069
|
0
|
|
|
|
|
|
p->low += p->range & (0 - ((value >> --numBits) & 1)); |
|
26070
|
0
|
0
|
|
|
|
|
if (p->range < kTopValue) |
|
26071
|
|
|
|
|
|
|
{ |
|
26072
|
0
|
|
|
|
|
|
p->range <<= 8; |
|
26073
|
0
|
|
|
|
|
|
RangeEnc_ShiftLow(p); |
|
26074
|
|
|
|
|
|
|
} |
|
26075
|
|
|
|
|
|
|
} |
|
26076
|
0
|
0
|
|
|
|
|
while (numBits != 0); |
|
26077
|
0
|
|
|
|
|
|
} |
|
26078
|
|
|
|
|
|
|
|
|
26079
|
0
|
|
|
|
|
|
static void RangeEnc_EncodeBit(CRangeEnc *p, CLzmaProb *prob, uint32_t symbol) |
|
26080
|
|
|
|
|
|
|
{ |
|
26081
|
0
|
|
|
|
|
|
uint32_t ttt = *prob; |
|
26082
|
0
|
|
|
|
|
|
uint32_t newBound = (p->range >> kNumBitModelTotalBits) * ttt; |
|
26083
|
0
|
0
|
|
|
|
|
if (symbol == 0) |
|
26084
|
|
|
|
|
|
|
{ |
|
26085
|
0
|
|
|
|
|
|
p->range = newBound; |
|
26086
|
0
|
|
|
|
|
|
ttt += (kBitModelTotal - ttt) >> kNumMoveBits; |
|
26087
|
|
|
|
|
|
|
} |
|
26088
|
|
|
|
|
|
|
else |
|
26089
|
|
|
|
|
|
|
{ |
|
26090
|
0
|
|
|
|
|
|
p->low += newBound; |
|
26091
|
0
|
|
|
|
|
|
p->range -= newBound; |
|
26092
|
0
|
|
|
|
|
|
ttt -= ttt >> kNumMoveBits; |
|
26093
|
|
|
|
|
|
|
} |
|
26094
|
0
|
|
|
|
|
|
*prob = (CLzmaProb)ttt; |
|
26095
|
0
|
0
|
|
|
|
|
if (p->range < kTopValue) |
|
26096
|
|
|
|
|
|
|
{ |
|
26097
|
0
|
|
|
|
|
|
p->range <<= 8; |
|
26098
|
0
|
|
|
|
|
|
RangeEnc_ShiftLow(p); |
|
26099
|
|
|
|
|
|
|
} |
|
26100
|
0
|
|
|
|
|
|
} |
|
26101
|
|
|
|
|
|
|
|
|
26102
|
0
|
|
|
|
|
|
static void LitEnc_Encode(CRangeEnc *p, CLzmaProb *probs, uint32_t symbol) |
|
26103
|
|
|
|
|
|
|
{ |
|
26104
|
0
|
|
|
|
|
|
symbol |= 0x100; |
|
26105
|
|
|
|
|
|
|
do |
|
26106
|
|
|
|
|
|
|
{ |
|
26107
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(p, probs + (symbol >> 8), (symbol >> 7) & 1); |
|
26108
|
0
|
|
|
|
|
|
symbol <<= 1; |
|
26109
|
|
|
|
|
|
|
} |
|
26110
|
0
|
0
|
|
|
|
|
while (symbol < 0x10000); |
|
26111
|
0
|
|
|
|
|
|
} |
|
26112
|
|
|
|
|
|
|
|
|
26113
|
0
|
|
|
|
|
|
static void LitEnc_EncodeMatched(CRangeEnc *p, CLzmaProb *probs, uint32_t symbol, uint32_t matchByte) |
|
26114
|
|
|
|
|
|
|
{ |
|
26115
|
|
|
|
|
|
|
uint32_t offs = 0x100; |
|
26116
|
0
|
|
|
|
|
|
symbol |= 0x100; |
|
26117
|
|
|
|
|
|
|
do |
|
26118
|
|
|
|
|
|
|
{ |
|
26119
|
0
|
|
|
|
|
|
matchByte <<= 1; |
|
26120
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(p, probs + (offs + (matchByte & offs) + (symbol >> 8)), (symbol >> 7) & 1); |
|
26121
|
0
|
|
|
|
|
|
symbol <<= 1; |
|
26122
|
0
|
|
|
|
|
|
offs &= ~(matchByte ^ symbol); |
|
26123
|
|
|
|
|
|
|
} |
|
26124
|
0
|
0
|
|
|
|
|
while (symbol < 0x10000); |
|
26125
|
0
|
|
|
|
|
|
} |
|
26126
|
|
|
|
|
|
|
|
|
26127
|
0
|
|
|
|
|
|
void LzmaEnc_InitPriceTables(uint32_t *ProbPrices) |
|
26128
|
|
|
|
|
|
|
{ |
|
26129
|
|
|
|
|
|
|
uint32_t i; |
|
26130
|
0
|
0
|
|
|
|
|
for (i = (1 << kNumMoveReducingBits) / 2; i < kBitModelTotal; i += (1 << kNumMoveReducingBits)) |
|
26131
|
|
|
|
|
|
|
{ |
|
26132
|
|
|
|
|
|
|
const int kCyclesBits = kNumBitPriceShiftBits; |
|
26133
|
|
|
|
|
|
|
uint32_t w = i; |
|
26134
|
|
|
|
|
|
|
uint32_t bitCount = 0; |
|
26135
|
|
|
|
|
|
|
int j; |
|
26136
|
0
|
0
|
|
|
|
|
for (j = 0; j < kCyclesBits; j++) |
|
26137
|
|
|
|
|
|
|
{ |
|
26138
|
0
|
|
|
|
|
|
w = w * w; |
|
26139
|
0
|
|
|
|
|
|
bitCount <<= 1; |
|
26140
|
0
|
0
|
|
|
|
|
while (w >= ((uint32_t)1 << 16)) |
|
26141
|
|
|
|
|
|
|
{ |
|
26142
|
0
|
|
|
|
|
|
w >>= 1; |
|
26143
|
0
|
|
|
|
|
|
bitCount++; |
|
26144
|
|
|
|
|
|
|
} |
|
26145
|
|
|
|
|
|
|
} |
|
26146
|
0
|
|
|
|
|
|
ProbPrices[i >> kNumMoveReducingBits] = ((kNumBitModelTotalBits << kCyclesBits) - 15 - bitCount); |
|
26147
|
|
|
|
|
|
|
} |
|
26148
|
0
|
|
|
|
|
|
} |
|
26149
|
|
|
|
|
|
|
|
|
26150
|
|
|
|
|
|
|
#define GET_PRICE(prob, symbol) \ |
|
26151
|
|
|
|
|
|
|
p->ProbPrices[((prob) ^ (((-(int)(symbol))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; |
|
26152
|
|
|
|
|
|
|
|
|
26153
|
|
|
|
|
|
|
#define GET_PRICEa(prob, symbol) \ |
|
26154
|
|
|
|
|
|
|
ProbPrices[((prob) ^ ((-((int)(symbol))) & (kBitModelTotal - 1))) >> kNumMoveReducingBits]; |
|
26155
|
|
|
|
|
|
|
|
|
26156
|
|
|
|
|
|
|
#define GET_PRICE_0(prob) p->ProbPrices[(prob) >> kNumMoveReducingBits] |
|
26157
|
|
|
|
|
|
|
#define GET_PRICE_1(prob) p->ProbPrices[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits] |
|
26158
|
|
|
|
|
|
|
|
|
26159
|
|
|
|
|
|
|
#define GET_PRICE_0a(prob) ProbPrices[(prob) >> kNumMoveReducingBits] |
|
26160
|
|
|
|
|
|
|
#define GET_PRICE_1a(prob) ProbPrices[((prob) ^ (kBitModelTotal - 1)) >> kNumMoveReducingBits] |
|
26161
|
|
|
|
|
|
|
|
|
26162
|
|
|
|
|
|
|
static uint32_t LitEnc_GetPrice(const CLzmaProb *probs, uint32_t symbol, uint32_t *ProbPrices) |
|
26163
|
|
|
|
|
|
|
{ |
|
26164
|
|
|
|
|
|
|
uint32_t price = 0; |
|
26165
|
0
|
|
|
|
|
|
symbol |= 0x100; |
|
26166
|
|
|
|
|
|
|
do |
|
26167
|
|
|
|
|
|
|
{ |
|
26168
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[symbol >> 8], (symbol >> 7) & 1); |
|
26169
|
0
|
|
|
|
|
|
symbol <<= 1; |
|
26170
|
|
|
|
|
|
|
} |
|
26171
|
0
|
0
|
|
|
|
|
while (symbol < 0x10000); |
|
|
|
0
|
|
|
|
|
|
|
26172
|
|
|
|
|
|
|
return price; |
|
26173
|
|
|
|
|
|
|
} |
|
26174
|
|
|
|
|
|
|
|
|
26175
|
0
|
|
|
|
|
|
static uint32_t LitEnc_GetPriceMatched(const CLzmaProb *probs, uint32_t symbol, uint32_t matchByte, uint32_t *ProbPrices) |
|
26176
|
|
|
|
|
|
|
{ |
|
26177
|
|
|
|
|
|
|
uint32_t price = 0; |
|
26178
|
|
|
|
|
|
|
uint32_t offs = 0x100; |
|
26179
|
0
|
|
|
|
|
|
symbol |= 0x100; |
|
26180
|
|
|
|
|
|
|
do |
|
26181
|
|
|
|
|
|
|
{ |
|
26182
|
0
|
|
|
|
|
|
matchByte <<= 1; |
|
26183
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[offs + (matchByte & offs) + (symbol >> 8)], (symbol >> 7) & 1); |
|
26184
|
0
|
|
|
|
|
|
symbol <<= 1; |
|
26185
|
0
|
|
|
|
|
|
offs &= ~(matchByte ^ symbol); |
|
26186
|
|
|
|
|
|
|
} |
|
26187
|
0
|
0
|
|
|
|
|
while (symbol < 0x10000); |
|
26188
|
0
|
|
|
|
|
|
return price; |
|
26189
|
|
|
|
|
|
|
} |
|
26190
|
|
|
|
|
|
|
|
|
26191
|
0
|
|
|
|
|
|
static void RcTree_Encode(CRangeEnc *rc, CLzmaProb *probs, int numBitLevels, uint32_t symbol) |
|
26192
|
|
|
|
|
|
|
{ |
|
26193
|
|
|
|
|
|
|
uint32_t m = 1; |
|
26194
|
|
|
|
|
|
|
int i; |
|
26195
|
0
|
0
|
|
|
|
|
for (i = numBitLevels; i != 0;) |
|
26196
|
|
|
|
|
|
|
{ |
|
26197
|
|
|
|
|
|
|
uint32_t bit; |
|
26198
|
0
|
|
|
|
|
|
i--; |
|
26199
|
0
|
|
|
|
|
|
bit = (symbol >> i) & 1; |
|
26200
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(rc, probs + m, bit); |
|
26201
|
0
|
|
|
|
|
|
m = (m << 1) | bit; |
|
26202
|
|
|
|
|
|
|
} |
|
26203
|
0
|
|
|
|
|
|
} |
|
26204
|
|
|
|
|
|
|
|
|
26205
|
0
|
|
|
|
|
|
static void RcTree_ReverseEncode(CRangeEnc *rc, CLzmaProb *probs, int numBitLevels, uint32_t symbol) |
|
26206
|
|
|
|
|
|
|
{ |
|
26207
|
|
|
|
|
|
|
uint32_t m = 1; |
|
26208
|
|
|
|
|
|
|
int i; |
|
26209
|
0
|
0
|
|
|
|
|
for (i = 0; i < numBitLevels; i++) |
|
26210
|
|
|
|
|
|
|
{ |
|
26211
|
0
|
|
|
|
|
|
uint32_t bit = symbol & 1; |
|
26212
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(rc, probs + m, bit); |
|
26213
|
0
|
|
|
|
|
|
m = (m << 1) | bit; |
|
26214
|
0
|
|
|
|
|
|
symbol >>= 1; |
|
26215
|
|
|
|
|
|
|
} |
|
26216
|
0
|
|
|
|
|
|
} |
|
26217
|
|
|
|
|
|
|
|
|
26218
|
|
|
|
|
|
|
static uint32_t RcTree_GetPrice(const CLzmaProb *probs, int numBitLevels, uint32_t symbol, uint32_t *ProbPrices) |
|
26219
|
|
|
|
|
|
|
{ |
|
26220
|
|
|
|
|
|
|
uint32_t price = 0; |
|
26221
|
0
|
|
|
|
|
|
symbol |= (1 << numBitLevels); |
|
26222
|
0
|
0
|
|
|
|
|
while (symbol != 1) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
26223
|
|
|
|
|
|
|
{ |
|
26224
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[symbol >> 1], symbol & 1); |
|
26225
|
|
|
|
|
|
|
symbol >>= 1; |
|
26226
|
|
|
|
|
|
|
} |
|
26227
|
|
|
|
|
|
|
return price; |
|
26228
|
|
|
|
|
|
|
} |
|
26229
|
|
|
|
|
|
|
|
|
26230
|
|
|
|
|
|
|
static uint32_t RcTree_ReverseGetPrice(const CLzmaProb *probs, int numBitLevels, uint32_t symbol, uint32_t *ProbPrices) |
|
26231
|
|
|
|
|
|
|
{ |
|
26232
|
|
|
|
|
|
|
uint32_t price = 0; |
|
26233
|
|
|
|
|
|
|
uint32_t m = 1; |
|
26234
|
|
|
|
|
|
|
int i; |
|
26235
|
0
|
0
|
|
|
|
|
for (i = numBitLevels; i != 0; i--) |
|
|
|
0
|
|
|
|
|
|
|
26236
|
|
|
|
|
|
|
{ |
|
26237
|
0
|
|
|
|
|
|
uint32_t bit = symbol & 1; |
|
26238
|
0
|
|
|
|
|
|
symbol >>= 1; |
|
26239
|
0
|
|
|
|
|
|
price += GET_PRICEa(probs[m], bit); |
|
26240
|
0
|
|
|
|
|
|
m = (m << 1) | bit; |
|
26241
|
|
|
|
|
|
|
} |
|
26242
|
|
|
|
|
|
|
return price; |
|
26243
|
|
|
|
|
|
|
} |
|
26244
|
|
|
|
|
|
|
|
|
26245
|
|
|
|
|
|
|
static void LenEnc_Init(CLenEnc *p) |
|
26246
|
|
|
|
|
|
|
{ |
|
26247
|
|
|
|
|
|
|
unsigned i; |
|
26248
|
0
|
|
|
|
|
|
p->choice = p->choice2 = kProbInitValue; |
|
26249
|
0
|
0
|
|
|
|
|
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++) |
|
|
|
0
|
|
|
|
|
|
|
26250
|
0
|
|
|
|
|
|
p->low[i] = kProbInitValue; |
|
26251
|
0
|
0
|
|
|
|
|
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++) |
|
|
|
0
|
|
|
|
|
|
|
26252
|
0
|
|
|
|
|
|
p->mid[i] = kProbInitValue; |
|
26253
|
0
|
0
|
|
|
|
|
for (i = 0; i < kLenNumHighSymbols; i++) |
|
|
|
0
|
|
|
|
|
|
|
26254
|
0
|
|
|
|
|
|
p->high[i] = kProbInitValue; |
|
26255
|
|
|
|
|
|
|
} |
|
26256
|
|
|
|
|
|
|
|
|
26257
|
0
|
|
|
|
|
|
static void LenEnc_Encode(CLenEnc *p, CRangeEnc *rc, uint32_t symbol, uint32_t posState) |
|
26258
|
|
|
|
|
|
|
{ |
|
26259
|
0
|
0
|
|
|
|
|
if (symbol < kLenNumLowSymbols) |
|
26260
|
|
|
|
|
|
|
{ |
|
26261
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(rc, &p->choice, 0); |
|
26262
|
0
|
|
|
|
|
|
RcTree_Encode(rc, p->low + (posState << kLenNumLowBits), kLenNumLowBits, symbol); |
|
26263
|
|
|
|
|
|
|
} |
|
26264
|
|
|
|
|
|
|
else |
|
26265
|
|
|
|
|
|
|
{ |
|
26266
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(rc, &p->choice, 1); |
|
26267
|
0
|
0
|
|
|
|
|
if (symbol < kLenNumLowSymbols + kLenNumMidSymbols) |
|
26268
|
|
|
|
|
|
|
{ |
|
26269
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(rc, &p->choice2, 0); |
|
26270
|
0
|
|
|
|
|
|
RcTree_Encode(rc, p->mid + (posState << kLenNumMidBits), kLenNumMidBits, symbol - kLenNumLowSymbols); |
|
26271
|
|
|
|
|
|
|
} |
|
26272
|
|
|
|
|
|
|
else |
|
26273
|
|
|
|
|
|
|
{ |
|
26274
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(rc, &p->choice2, 1); |
|
26275
|
0
|
|
|
|
|
|
RcTree_Encode(rc, p->high, kLenNumHighBits, symbol - kLenNumLowSymbols - kLenNumMidSymbols); |
|
26276
|
|
|
|
|
|
|
} |
|
26277
|
|
|
|
|
|
|
} |
|
26278
|
0
|
|
|
|
|
|
} |
|
26279
|
|
|
|
|
|
|
|
|
26280
|
0
|
|
|
|
|
|
static void LenEnc_SetPrices(CLenEnc *p, uint32_t posState, uint32_t numSymbols, uint32_t *prices, uint32_t *ProbPrices) |
|
26281
|
|
|
|
|
|
|
{ |
|
26282
|
0
|
|
|
|
|
|
uint32_t a0 = GET_PRICE_0a(p->choice); |
|
26283
|
0
|
|
|
|
|
|
uint32_t a1 = GET_PRICE_1a(p->choice); |
|
26284
|
0
|
|
|
|
|
|
uint32_t b0 = a1 + GET_PRICE_0a(p->choice2); |
|
26285
|
0
|
|
|
|
|
|
uint32_t b1 = a1 + GET_PRICE_1a(p->choice2); |
|
26286
|
|
|
|
|
|
|
uint32_t i = 0; |
|
26287
|
0
|
0
|
|
|
|
|
for (i = 0; i < kLenNumLowSymbols; i++) |
|
26288
|
|
|
|
|
|
|
{ |
|
26289
|
0
|
0
|
|
|
|
|
if (i >= numSymbols) |
|
26290
|
|
|
|
|
|
|
return; |
|
26291
|
0
|
|
|
|
|
|
prices[i] = a0 + RcTree_GetPrice(p->low + (posState << kLenNumLowBits), kLenNumLowBits, i, ProbPrices); |
|
26292
|
|
|
|
|
|
|
} |
|
26293
|
0
|
0
|
|
|
|
|
for (; i < kLenNumLowSymbols + kLenNumMidSymbols; i++) |
|
26294
|
|
|
|
|
|
|
{ |
|
26295
|
0
|
0
|
|
|
|
|
if (i >= numSymbols) |
|
26296
|
|
|
|
|
|
|
return; |
|
26297
|
0
|
|
|
|
|
|
prices[i] = b0 + RcTree_GetPrice(p->mid + (posState << kLenNumMidBits), kLenNumMidBits, i - kLenNumLowSymbols, ProbPrices); |
|
26298
|
|
|
|
|
|
|
} |
|
26299
|
0
|
0
|
|
|
|
|
for (; i < numSymbols; i++) |
|
26300
|
0
|
|
|
|
|
|
prices[i] = b1 + RcTree_GetPrice(p->high, kLenNumHighBits, i - kLenNumLowSymbols - kLenNumMidSymbols, ProbPrices); |
|
26301
|
|
|
|
|
|
|
} |
|
26302
|
|
|
|
|
|
|
|
|
26303
|
|
|
|
|
|
|
static void LenPriceEnc_UpdateTable(CLenPriceEnc *p, uint32_t posState, uint32_t *ProbPrices) |
|
26304
|
|
|
|
|
|
|
{ |
|
26305
|
0
|
|
|
|
|
|
LenEnc_SetPrices(&p->p, posState, p->tableSize, p->prices[posState], ProbPrices); |
|
26306
|
0
|
|
|
|
|
|
p->counters[posState] = p->tableSize; |
|
26307
|
|
|
|
|
|
|
} |
|
26308
|
|
|
|
|
|
|
|
|
26309
|
|
|
|
|
|
|
static void LenPriceEnc_UpdateTables(CLenPriceEnc *p, uint32_t numPosStates, uint32_t *ProbPrices) |
|
26310
|
|
|
|
|
|
|
{ |
|
26311
|
|
|
|
|
|
|
uint32_t posState; |
|
26312
|
0
|
0
|
|
|
|
|
for (posState = 0; posState < numPosStates; posState++) |
|
|
|
0
|
|
|
|
|
|
|
26313
|
|
|
|
|
|
|
LenPriceEnc_UpdateTable(p, posState, ProbPrices); |
|
26314
|
|
|
|
|
|
|
} |
|
26315
|
|
|
|
|
|
|
|
|
26316
|
0
|
|
|
|
|
|
static void LenEnc_Encode2(CLenPriceEnc *p, CRangeEnc *rc, uint32_t symbol, uint32_t posState, bool updatePrice, uint32_t *ProbPrices) |
|
26317
|
|
|
|
|
|
|
{ |
|
26318
|
0
|
|
|
|
|
|
LenEnc_Encode(&p->p, rc, symbol, posState); |
|
26319
|
0
|
0
|
|
|
|
|
if (updatePrice) |
|
26320
|
0
|
0
|
|
|
|
|
if (--p->counters[posState] == 0) |
|
26321
|
|
|
|
|
|
|
LenPriceEnc_UpdateTable(p, posState, ProbPrices); |
|
26322
|
0
|
|
|
|
|
|
} |
|
26323
|
|
|
|
|
|
|
|
|
26324
|
|
|
|
|
|
|
static void MovePos(CLzmaEnc *p, uint32_t num) |
|
26325
|
|
|
|
|
|
|
{ |
|
26326
|
0
|
0
|
|
|
|
|
if (num != 0) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
26327
|
|
|
|
|
|
|
{ |
|
26328
|
0
|
|
|
|
|
|
p->additionalOffset += num; |
|
26329
|
0
|
|
|
|
|
|
p->matchFinder.Skip(p->matchFinderObj, num); |
|
26330
|
|
|
|
|
|
|
} |
|
26331
|
|
|
|
|
|
|
} |
|
26332
|
|
|
|
|
|
|
|
|
26333
|
0
|
|
|
|
|
|
static uint32_t ReadMatchDistances(CLzmaEnc *p, uint32_t *numDistancePairsRes) |
|
26334
|
|
|
|
|
|
|
{ |
|
26335
|
|
|
|
|
|
|
uint32_t lenRes = 0, numPairs; |
|
26336
|
0
|
|
|
|
|
|
p->numAvail = p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); |
|
26337
|
0
|
|
|
|
|
|
numPairs = p->matchFinder.GetMatches(p->matchFinderObj, p->matches); |
|
26338
|
0
|
0
|
|
|
|
|
if (numPairs > 0) |
|
26339
|
|
|
|
|
|
|
{ |
|
26340
|
0
|
|
|
|
|
|
lenRes = p->matches[numPairs - 2]; |
|
26341
|
0
|
0
|
|
|
|
|
if (lenRes == p->numFastBytes) |
|
26342
|
|
|
|
|
|
|
{ |
|
26343
|
0
|
|
|
|
|
|
const uint8_t *pby = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1; |
|
26344
|
0
|
|
|
|
|
|
uint32_t distance = p->matches[numPairs - 1] + 1; |
|
26345
|
0
|
|
|
|
|
|
uint32_t numAvail = p->numAvail; |
|
26346
|
0
|
0
|
|
|
|
|
if (numAvail > LZMA_MATCH_LEN_MAX) |
|
26347
|
|
|
|
|
|
|
numAvail = LZMA_MATCH_LEN_MAX; |
|
26348
|
|
|
|
|
|
|
{ |
|
26349
|
0
|
|
|
|
|
|
const uint8_t *pby2 = pby - distance; |
|
26350
|
0
|
0
|
|
|
|
|
for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++); |
|
|
|
0
|
|
|
|
|
|
|
26351
|
|
|
|
|
|
|
} |
|
26352
|
|
|
|
|
|
|
} |
|
26353
|
|
|
|
|
|
|
} |
|
26354
|
0
|
|
|
|
|
|
p->additionalOffset++; |
|
26355
|
0
|
|
|
|
|
|
*numDistancePairsRes = numPairs; |
|
26356
|
0
|
|
|
|
|
|
return lenRes; |
|
26357
|
|
|
|
|
|
|
} |
|
26358
|
|
|
|
|
|
|
|
|
26359
|
|
|
|
|
|
|
#define MakeAsChar(p) (p)->backPrev = (uint32_t)(-1); (p)->prev1IsChar = false; |
|
26360
|
|
|
|
|
|
|
#define MakeAsShortRep(p) (p)->backPrev = 0; (p)->prev1IsChar = false; |
|
26361
|
|
|
|
|
|
|
#define IsShortRep(p) ((p)->backPrev == 0) |
|
26362
|
|
|
|
|
|
|
|
|
26363
|
|
|
|
|
|
|
static uint32_t GetRepLen1Price(CLzmaEnc *p, uint32_t state, uint32_t posState) |
|
26364
|
|
|
|
|
|
|
{ |
|
26365
|
|
|
|
|
|
|
return |
|
26366
|
0
|
|
|
|
|
|
GET_PRICE_0(p->isRepG0[state]) + |
|
26367
|
0
|
|
|
|
|
|
GET_PRICE_0(p->isRep0Long[state][posState]); |
|
26368
|
|
|
|
|
|
|
} |
|
26369
|
|
|
|
|
|
|
|
|
26370
|
0
|
|
|
|
|
|
static uint32_t GetPureRepPrice(CLzmaEnc *p, uint32_t repIndex, uint32_t state, uint32_t posState) |
|
26371
|
|
|
|
|
|
|
{ |
|
26372
|
|
|
|
|
|
|
uint32_t price; |
|
26373
|
0
|
0
|
|
|
|
|
if (repIndex == 0) |
|
26374
|
|
|
|
|
|
|
{ |
|
26375
|
0
|
|
|
|
|
|
price = GET_PRICE_0(p->isRepG0[state]); |
|
26376
|
0
|
|
|
|
|
|
price += GET_PRICE_1(p->isRep0Long[state][posState]); |
|
26377
|
|
|
|
|
|
|
} |
|
26378
|
|
|
|
|
|
|
else |
|
26379
|
|
|
|
|
|
|
{ |
|
26380
|
0
|
|
|
|
|
|
price = GET_PRICE_1(p->isRepG0[state]); |
|
26381
|
0
|
0
|
|
|
|
|
if (repIndex == 1) |
|
26382
|
0
|
|
|
|
|
|
price += GET_PRICE_0(p->isRepG1[state]); |
|
26383
|
|
|
|
|
|
|
else |
|
26384
|
|
|
|
|
|
|
{ |
|
26385
|
0
|
|
|
|
|
|
price += GET_PRICE_1(p->isRepG1[state]); |
|
26386
|
0
|
|
|
|
|
|
price += GET_PRICE(p->isRepG2[state], repIndex - 2); |
|
26387
|
|
|
|
|
|
|
} |
|
26388
|
|
|
|
|
|
|
} |
|
26389
|
0
|
|
|
|
|
|
return price; |
|
26390
|
|
|
|
|
|
|
} |
|
26391
|
|
|
|
|
|
|
|
|
26392
|
|
|
|
|
|
|
static uint32_t GetRepPrice(CLzmaEnc *p, uint32_t repIndex, uint32_t len, uint32_t state, uint32_t posState) |
|
26393
|
|
|
|
|
|
|
{ |
|
26394
|
0
|
|
|
|
|
|
return p->repLenEnc.prices[posState][len - LZMA_MATCH_LEN_MIN] + |
|
26395
|
0
|
|
|
|
|
|
GetPureRepPrice(p, repIndex, state, posState); |
|
26396
|
|
|
|
|
|
|
} |
|
26397
|
|
|
|
|
|
|
|
|
26398
|
|
|
|
|
|
|
static uint32_t Backward(CLzmaEnc *p, uint32_t *backRes, uint32_t cur) |
|
26399
|
|
|
|
|
|
|
{ |
|
26400
|
0
|
|
|
|
|
|
uint32_t posMem = p->opt[cur].posPrev; |
|
26401
|
0
|
|
|
|
|
|
uint32_t backMem = p->opt[cur].backPrev; |
|
26402
|
0
|
|
|
|
|
|
p->optimumEndIndex = cur; |
|
26403
|
|
|
|
|
|
|
do |
|
26404
|
|
|
|
|
|
|
{ |
|
26405
|
0
|
0
|
|
|
|
|
if (p->opt[cur].prev1IsChar) |
|
|
|
0
|
|
|
|
|
|
|
26406
|
|
|
|
|
|
|
{ |
|
26407
|
0
|
|
|
|
|
|
MakeAsChar(&p->opt[posMem]) |
|
26408
|
0
|
|
|
|
|
|
p->opt[posMem].posPrev = posMem - 1; |
|
26409
|
0
|
0
|
|
|
|
|
if (p->opt[cur].prev2) |
|
|
|
0
|
|
|
|
|
|
|
26410
|
|
|
|
|
|
|
{ |
|
26411
|
0
|
|
|
|
|
|
p->opt[posMem - 1].prev1IsChar = false; |
|
26412
|
0
|
|
|
|
|
|
p->opt[posMem - 1].posPrev = p->opt[cur].posPrev2; |
|
26413
|
0
|
|
|
|
|
|
p->opt[posMem - 1].backPrev = p->opt[cur].backPrev2; |
|
26414
|
|
|
|
|
|
|
} |
|
26415
|
|
|
|
|
|
|
} |
|
26416
|
|
|
|
|
|
|
{ |
|
26417
|
|
|
|
|
|
|
uint32_t posPrev = posMem; |
|
26418
|
|
|
|
|
|
|
uint32_t backCur = backMem; |
|
26419
|
|
|
|
|
|
|
|
|
26420
|
0
|
|
|
|
|
|
backMem = p->opt[posPrev].backPrev; |
|
26421
|
0
|
|
|
|
|
|
posMem = p->opt[posPrev].posPrev; |
|
26422
|
|
|
|
|
|
|
|
|
26423
|
0
|
|
|
|
|
|
p->opt[posPrev].backPrev = backCur; |
|
26424
|
0
|
|
|
|
|
|
p->opt[posPrev].posPrev = cur; |
|
26425
|
|
|
|
|
|
|
cur = posPrev; |
|
26426
|
|
|
|
|
|
|
} |
|
26427
|
|
|
|
|
|
|
} |
|
26428
|
0
|
0
|
|
|
|
|
while (cur != 0); |
|
|
|
0
|
|
|
|
|
|
|
26429
|
0
|
|
|
|
|
|
*backRes = p->opt[0].backPrev; |
|
26430
|
0
|
|
|
|
|
|
p->optimumCurrentIndex = p->opt[0].posPrev; |
|
26431
|
|
|
|
|
|
|
return p->optimumCurrentIndex; |
|
26432
|
|
|
|
|
|
|
} |
|
26433
|
|
|
|
|
|
|
|
|
26434
|
|
|
|
|
|
|
#define LIT_PROBS(pos, prevByte) (p->litProbs + ((((pos) & p->lpMask) << p->lc) + ((prevByte) >> (8 - p->lc))) * 0x300) |
|
26435
|
|
|
|
|
|
|
|
|
26436
|
0
|
|
|
|
|
|
/*
 * Picks the next encoding decision by pricing candidate encodings in the
 * p->opt[] table and then walking the cheapest chain with Backward().
 *
 * p        - encoder state; p->opt[], p->matches, p->reps, p->numPairs,
 *            p->longestMatchLength and the optimum*Index fields are read
 *            and/or written here.
 * position - stream position, used only to derive posState (position &
 *            p->pbMask) and the literal probability context (LIT_PROBS).
 * backRes  - out: (uint32_t)-1 together with a return value of 1 (presumably
 *            "emit a literal" - confirm against the caller), a value below
 *            LZMA_NUM_REPS for a repeated distance index, or
 *            distance + LZMA_NUM_REPS for a match from p->matches.
 *
 * Returns the length of the chosen step.
 */
static uint32_t GetOptimum(CLzmaEnc *p, uint32_t position, uint32_t *backRes)
{
  uint32_t numAvail, mainLen, numPairs, repMaxIndex, i, posState, lenEnd, len, cur;
  uint32_t matchPrice, repMatchPrice, normalMatchPrice;
  uint32_t reps[LZMA_NUM_REPS], repLens[LZMA_NUM_REPS];
  uint32_t *matches;
  const uint8_t *data;
  uint8_t curByte, matchByte;
  /* Steps left over from an earlier call are still queued in p->opt[]:
     hand out the next one without doing any new pricing. */
  if (p->optimumEndIndex != p->optimumCurrentIndex)
  {
    const COptimal *opt = &p->opt[p->optimumCurrentIndex];
    uint32_t lenRes = opt->posPrev - p->optimumCurrentIndex;
    *backRes = opt->backPrev;
    p->optimumCurrentIndex = opt->posPrev;
    return lenRes;
  }
  p->optimumCurrentIndex = p->optimumEndIndex = 0;

  /* Either read fresh matches, or reuse the ones a previous call stored in
     p->longestMatchLength / p->numPairs (selected by p->additionalOffset). */
  if (p->additionalOffset == 0)
    mainLen = ReadMatchDistances(p, &numPairs);
  else
  {
    mainLen = p->longestMatchLength;
    numPairs = p->numPairs;
  }

  numAvail = p->numAvail;
  if (numAvail < 2)
  {
    /* Fewer than two bytes available: no match of length >= 2 is possible. */
    *backRes = (uint32_t)(-1);
    return 1;
  }
  if (numAvail > LZMA_MATCH_LEN_MAX)
    numAvail = LZMA_MATCH_LEN_MAX;

  /* data points one byte before the match finder's current pointer. */
  data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
  repMaxIndex = 0;
  /* For each stored distance p->reps[i], measure how many bytes (capped at
     numAvail) match at that distance; remember the index of the longest. */
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t lenTest;
    const uint8_t *data2;
    reps[i] = p->reps[i];
    data2 = data - (reps[i] + 1);
    if (data[0] != data2[0] || data[1] != data2[1])
    {
      repLens[i] = 0;
      continue;
    }
    for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
    repLens[i] = lenTest;
    if (lenTest > repLens[repMaxIndex])
      repMaxIndex = i;
  }
  /* Shortcut: a repeated-distance match of at least numFastBytes is taken
     immediately, skipping the price computation. */
  if (repLens[repMaxIndex] >= p->numFastBytes)
  {
    uint32_t lenRes;
    *backRes = repMaxIndex;
    lenRes = repLens[repMaxIndex];
    MovePos(p, lenRes - 1);
    return lenRes;
  }

  matches = p->matches;
  /* Same shortcut for the longest regular match; matches[] holds
     (length, distance) pairs, so matches[numPairs - 1] is the last distance. */
  if (mainLen >= p->numFastBytes)
  {
    *backRes = matches[numPairs - 1] + LZMA_NUM_REPS;
    MovePos(p, mainLen - 1);
    return mainLen;
  }
  curByte = *data;
  matchByte = *(data - (reps[0] + 1));

  /* No match, no usable repeated distance, and not even a one-byte match at
     reps[0]: return length 1 with backRes = (uint32_t)-1. */
  if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2)
  {
    *backRes = (uint32_t)-1;
    return 1;
  }

  p->opt[0].state = (CState)p->state;

  posState = (position & p->pbMask);

  /* Price of reaching opt[1] by coding curByte as a literal. */
  {
    const CLzmaProb *probs = LIT_PROBS(position, *(data - 1));
    p->opt[1].price = GET_PRICE_0(p->isMatch[p->state][posState]) +
        (!IsCharState(p->state) ?
          LitEnc_GetPriceMatched(probs, curByte, matchByte, p->ProbPrices) :
          LitEnc_GetPrice(probs, curByte, p->ProbPrices));
  }

  MakeAsChar(&p->opt[1]);

  matchPrice = GET_PRICE_1(p->isMatch[p->state][posState]);
  repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[p->state]);

  /* A one-byte repeat at reps[0] replaces the literal when it is strictly cheaper. */
  if (matchByte == curByte)
  {
    uint32_t shortRepPrice = repMatchPrice + GetRepLen1Price(p, p->state, posState);
    if (shortRepPrice < p->opt[1].price)
    {
      p->opt[1].price = shortRepPrice;
      MakeAsShortRep(&p->opt[1]);
    }
  }
  /* lenEnd is the furthest opt[] index reachable so far. */
  lenEnd = ((mainLen >= repLens[repMaxIndex]) ? mainLen : repLens[repMaxIndex]);

  if (lenEnd < 2)
  {
    *backRes = p->opt[1].backPrev;
    return 1;
  }

  p->opt[1].posPrev = 0;
  for (i = 0; i < LZMA_NUM_REPS; i++)
    p->opt[0].backs[i] = reps[i];

  /* Mark opt[2..lenEnd] as not yet reached. */
  len = lenEnd;
  do
    p->opt[len--].price = kInfinityPrice;
  while (len >= 2);

  /* Seed opt[2..repLens[i]] with the price of a repeated-distance match
     starting at position 0, for every repeated distance that matched. */
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t repLen = repLens[i];
    uint32_t price;
    if (repLen < 2)
      continue;
    price = repMatchPrice + GetPureRepPrice(p, i, p->state, posState);
    do
    {
      uint32_t curAndLenPrice = price + p->repLenEnc.prices[posState][repLen - 2];
      COptimal *opt = &p->opt[repLen];
      if (curAndLenPrice < opt->price)
      {
        opt->price = curAndLenPrice;
        opt->posPrev = 0;
        opt->backPrev = i;
        opt->prev1IsChar = false;
      }
    }
    while (--repLen >= 2);
  }

  normalMatchPrice = matchPrice + GET_PRICE_0(p->isRep[p->state]);

  /* Seed with regular matches; lengths already covered by reps[0] are skipped
     (len starts at repLens[0] + 1 when reps[0] matched). */
  len = ((repLens[0] >= 2) ? repLens[0] + 1 : 2);
  if (len <= mainLen)
  {
    uint32_t offs = 0;
    /* Advance to the first (length, distance) pair whose length covers len. */
    while (len > matches[offs])
      offs += 2;
    for (; ; len++)
    {
      COptimal *opt;
      uint32_t distance = matches[offs + 1];

      uint32_t curAndLenPrice = normalMatchPrice + p->lenEnc.prices[posState][len - LZMA_MATCH_LEN_MIN];
      uint32_t lenToPosState = GetLenToPosState(len);
      /* Distances below kNumFullDistances use the distancesPrices table;
         larger ones are priced as slot price + align price. */
      if (distance < kNumFullDistances)
        curAndLenPrice += p->distancesPrices[lenToPosState][distance];
      else
      {
        uint32_t slot;
        GetPosSlot2(distance, slot);
        curAndLenPrice += p->alignPrices[distance & kAlignMask] + p->posSlotPrices[lenToPosState][slot];
      }
      opt = &p->opt[len];
      if (curAndLenPrice < opt->price)
      {
        opt->price = curAndLenPrice;
        opt->posPrev = 0;
        opt->backPrev = distance + LZMA_NUM_REPS;
        opt->prev1IsChar = false;
      }
      /* Reached the end of the current pair's length: move to the next pair,
         or stop once all pairs are consumed. */
      if (len == matches[offs])
      {
        offs += 2;
        if (offs == numPairs)
          break;
      }
    }
  }

  cur = 0;

  /* Main loop: advance one position at a time, reconstruct the state and
     repeated distances at opt[cur], and relax the prices of later entries.
     NOTE: numPairs, posState, matchPrice, repMatchPrice, curByte, matchByte
     and data declared below shadow the function-level variables of the same
     name for the duration of the loop body. */
  for (;;)
  {
    uint32_t numAvailFull, newLen, numPairs, posPrev, state, posState, startLen;
    uint32_t curPrice, curAnd1Price, matchPrice, repMatchPrice;
    bool nextIsChar;
    uint8_t curByte, matchByte;
    const uint8_t *data;
    COptimal *curOpt;
    COptimal *nextOpt;

    cur++;
    /* Every reachable position has been processed: emit the chain. */
    if (cur == lenEnd)
      return Backward(p, backRes, cur);

    newLen = ReadMatchDistances(p, &numPairs);
    /* A long match here ends the search early; the match is stashed in p so
       that the next call can reuse it instead of reading matches again. */
    if (newLen >= p->numFastBytes)
    {
      p->numPairs = numPairs;
      p->longestMatchLength = newLen;
      return Backward(p, backRes, cur);
    }
    position++;
    curOpt = &p->opt[cur];
    posPrev = curOpt->posPrev;
    /* Derive the coder state at cur from the state stored at its predecessor,
       replaying the step(s) recorded in curOpt. */
    if (curOpt->prev1IsChar)
    {
      posPrev--;
      if (curOpt->prev2)
      {
        state = p->opt[curOpt->posPrev2].state;
        if (curOpt->backPrev2 < LZMA_NUM_REPS)
          state = kRepNextStates[state];
        else
          state = kMatchNextStates[state];
      }
      else
        state = p->opt[posPrev].state;
      state = kLiteralNextStates[state];
    }
    else
      state = p->opt[posPrev].state;
    if (posPrev == cur - 1)
    {
      /* Single-byte step: a short rep or a literal; reps[] is left unchanged. */
      if (IsShortRep(curOpt))
        state = kShortRepNextStates[state];
      else
        state = kLiteralNextStates[state];
    }
    else
    {
      uint32_t pos;
      const COptimal *prevOpt;
      if (curOpt->prev1IsChar && curOpt->prev2)
      {
        posPrev = curOpt->posPrev2;
        pos = curOpt->backPrev2;
        state = kRepNextStates[state];
      }
      else
      {
        pos = curOpt->backPrev;
        if (pos < LZMA_NUM_REPS)
          state = kRepNextStates[state];
        else
          state = kMatchNextStates[state];
      }
      prevOpt = &p->opt[posPrev];
      /* Rebuild reps[] from the predecessor's backs[]: the distance that was
         used moves to the front and the ones before it shift down by one. */
      if (pos < LZMA_NUM_REPS)
      {
        uint32_t i;
        reps[0] = prevOpt->backs[pos];
        for (i = 1; i <= pos; i++)
          reps[i] = prevOpt->backs[i - 1];
        for (; i < LZMA_NUM_REPS; i++)
          reps[i] = prevOpt->backs[i];
      }
      else
      {
        /* Regular match: its distance becomes reps[0], the rest shift down. */
        uint32_t i;
        reps[0] = (pos - LZMA_NUM_REPS);
        for (i = 1; i < LZMA_NUM_REPS; i++)
          reps[i] = prevOpt->backs[i - 1];
      }
    }
    curOpt->state = (CState)state;

    curOpt->backs[0] = reps[0];
    curOpt->backs[1] = reps[1];
    curOpt->backs[2] = reps[2];
    curOpt->backs[3] = reps[3];

    curPrice = curOpt->price;
    nextIsChar = false;
    data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
    curByte = *data;
    matchByte = *(data - (reps[0] + 1));

    posState = (position & p->pbMask);

    /* Candidate 1: literal from cur to cur + 1. */
    curAnd1Price = curPrice + GET_PRICE_0(p->isMatch[state][posState]);
    {
      const CLzmaProb *probs = LIT_PROBS(position, *(data - 1));
      curAnd1Price +=
        (!IsCharState(state) ?
          LitEnc_GetPriceMatched(probs, curByte, matchByte, p->ProbPrices) :
          LitEnc_GetPrice(probs, curByte, p->ProbPrices));
    }

    nextOpt = &p->opt[cur + 1];

    if (curAnd1Price < nextOpt->price)
    {
      nextOpt->price = curAnd1Price;
      nextOpt->posPrev = cur;
      MakeAsChar(nextOpt);
      nextIsChar = true;
    }

    matchPrice = curPrice + GET_PRICE_1(p->isMatch[state][posState]);
    repMatchPrice = matchPrice + GET_PRICE_1(p->isRep[state]);

    /* Candidate 2: one-byte repeat at reps[0]; skipped when opt[cur + 1] is
       already reached from an earlier position with backPrev == 0. */
    if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0))
    {
      uint32_t shortRepPrice = repMatchPrice + GetRepLen1Price(p, state, posState);
      if (shortRepPrice <= nextOpt->price)
      {
        nextOpt->price = shortRepPrice;
        nextOpt->posPrev = cur;
        MakeAsShortRep(nextOpt);
        nextIsChar = true;
      }
    }
    /* Limit look-ahead so that no index written below exceeds kNumOpts - 1. */
    numAvailFull = p->numAvail;
    {
      uint32_t temp = kNumOpts - 1 - cur;
      if (temp < numAvailFull)
        numAvailFull = temp;
    }

    if (numAvailFull < 2)
      continue;
    numAvail = (numAvailFull <= p->numFastBytes ? numAvailFull : p->numFastBytes);

    if (!nextIsChar && matchByte != curByte) /* speed optimization */
    {
      /* try Literal + rep0 */
      uint32_t temp;
      uint32_t lenTest2;
      const uint8_t *data2 = data - (reps[0] + 1);
      uint32_t limit = p->numFastBytes + 1;
      if (limit > numAvailFull)
        limit = numAvailFull;

      /* Count matching bytes at distance reps[0], starting after the literal. */
      for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++);
      lenTest2 = temp - 1;
      if (lenTest2 >= 2)
      {
        uint32_t state2 = kLiteralNextStates[state];
        uint32_t posStateNext = (position + 1) & p->pbMask;
        uint32_t nextRepMatchPrice = curAnd1Price +
            GET_PRICE_1(p->isMatch[state2][posStateNext]) +
            GET_PRICE_1(p->isRep[state2]);
        /* for (; lenTest2 >= 2; lenTest2--) */
        {
          uint32_t curAndLenPrice;
          COptimal *opt;
          uint32_t offset = cur + 1 + lenTest2;
          /* Extend the reachable range, initialising the new entries. */
          while (lenEnd < offset)
            p->opt[++lenEnd].price = kInfinityPrice;
          curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext);
          opt = &p->opt[offset];
          if (curAndLenPrice < opt->price)
          {
            opt->price = curAndLenPrice;
            opt->posPrev = cur + 1;
            opt->backPrev = 0;
            opt->prev1IsChar = true;
            opt->prev2 = false;
          }
        }
      }
    }

    startLen = 2; /* speed optimization */
    {
    uint32_t repIndex;
    /* Candidate 3: repeated-distance matches starting at cur. */
    for (repIndex = 0; repIndex < LZMA_NUM_REPS; repIndex++)
    {
      uint32_t lenTest;
      uint32_t lenTestTemp;
      uint32_t price;
      const uint8_t *data2 = data - (reps[repIndex] + 1);
      if (data[0] != data2[0] || data[1] != data2[1])
        continue;
      for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++);
      while (lenEnd < cur + lenTest)
        p->opt[++lenEnd].price = kInfinityPrice;
      /* The do/while below counts lenTest down; keep the full length. */
      lenTestTemp = lenTest;
      price = repMatchPrice + GetPureRepPrice(p, repIndex, state, posState);
      do
      {
        uint32_t curAndLenPrice = price + p->repLenEnc.prices[posState][lenTest - 2];
        COptimal *opt = &p->opt[cur + lenTest];
        if (curAndLenPrice < opt->price)
        {
          opt->price = curAndLenPrice;
          opt->posPrev = cur;
          opt->backPrev = repIndex;
          opt->prev1IsChar = false;
        }
      }
      while (--lenTest >= 2);
      lenTest = lenTestTemp;

      /* Regular matches no longer than the reps[0] match are not priced below. */
      if (repIndex == 0)
        startLen = lenTest + 1;

      /* if (_maxMode) */
        if (1)
        {
          /* Try Rep + Literal + Rep0: after the repeated match and one
             literal, count further matching bytes at the same distance. */
          uint32_t lenTest2 = lenTest + 1;
          uint32_t limit = lenTest2 + p->numFastBytes;
          uint32_t nextRepMatchPrice;
          if (limit > numAvailFull)
            limit = numAvailFull;
          for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
          lenTest2 -= lenTest + 1;
          if (lenTest2 >= 2)
          {
            uint32_t state2 = kRepNextStates[state];
            uint32_t posStateNext = (position + lenTest) & p->pbMask;
            uint32_t curAndLenCharPrice =
                price + p->repLenEnc.prices[posState][lenTest - 2] +
                GET_PRICE_0(p->isMatch[state2][posStateNext]) +
                LitEnc_GetPriceMatched(LIT_PROBS(position + lenTest, data[lenTest - 1]),
                    data[lenTest], data2[lenTest], p->ProbPrices);
            state2 = kLiteralNextStates[state2];
            posStateNext = (position + lenTest + 1) & p->pbMask;
            nextRepMatchPrice = curAndLenCharPrice +
                GET_PRICE_1(p->isMatch[state2][posStateNext]) +
                GET_PRICE_1(p->isRep[state2]);

            /* for (; lenTest2 >= 2; lenTest2--) */
            {
              uint32_t curAndLenPrice;
              COptimal *opt;
              uint32_t offset = cur + lenTest + 1 + lenTest2;
              while (lenEnd < offset)
                p->opt[++lenEnd].price = kInfinityPrice;
              curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext);
              opt = &p->opt[offset];
              if (curAndLenPrice < opt->price)
              {
                /* prev2 / posPrev2 / backPrev2 record the first step of the
                   three-step sequence so it can be replayed later. */
                opt->price = curAndLenPrice;
                opt->posPrev = cur + lenTest + 1;
                opt->backPrev = 0;
                opt->prev1IsChar = true;
                opt->prev2 = true;
                opt->posPrev2 = cur;
                opt->backPrev2 = repIndex;
              }
            }
          }
        }
    }
    }
    /* for (uint32_t lenTest = 2; lenTest <= newLen; lenTest++) */
    /* Truncate the match list to numAvail: the first pair whose length
       reaches newLen is shortened and becomes the last pair. */
    if (newLen > numAvail)
    {
      newLen = numAvail;
      for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2);
      matches[numPairs] = newLen;
      numPairs += 2;
    }
    /* Candidate 4: regular matches starting at cur, lengths startLen..newLen. */
    if (newLen >= startLen)
    {
      uint32_t normalMatchPrice = matchPrice + GET_PRICE_0(p->isRep[state]);
      uint32_t offs, curBack, posSlot;
      uint32_t lenTest;
      while (lenEnd < cur + newLen)
        p->opt[++lenEnd].price = kInfinityPrice;

      offs = 0;
      while (startLen > matches[offs])
        offs += 2;
      curBack = matches[offs + 1];
      GetPosSlot2(curBack, posSlot);
      for (lenTest = /*2*/ startLen; ; lenTest++)
      {
        uint32_t curAndLenPrice = normalMatchPrice + p->lenEnc.prices[posState][lenTest - LZMA_MATCH_LEN_MIN];
        uint32_t lenToPosState = GetLenToPosState(lenTest);
        COptimal *opt;
        if (curBack < kNumFullDistances)
          curAndLenPrice += p->distancesPrices[lenToPosState][curBack];
        else
          curAndLenPrice += p->posSlotPrices[lenToPosState][posSlot] + p->alignPrices[curBack & kAlignMask];

        opt = &p->opt[cur + lenTest];
        if (curAndLenPrice < opt->price)
        {
          opt->price = curAndLenPrice;
          opt->posPrev = cur;
          opt->backPrev = curBack + LZMA_NUM_REPS;
          opt->prev1IsChar = false;
        }

        /* Only at the full length of the current pair is the three-step
           sequence tried, before moving on to the next pair. */
        if (/*_maxMode && */lenTest == matches[offs])
        {
          /* Try Match + Literal + Rep0 */
          const uint8_t *data2 = data - (curBack + 1);
          uint32_t lenTest2 = lenTest + 1;
          uint32_t limit = lenTest2 + p->numFastBytes;
          uint32_t nextRepMatchPrice;
          if (limit > numAvailFull)
            limit = numAvailFull;
          for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++);
          lenTest2 -= lenTest + 1;
          if (lenTest2 >= 2)
          {
            uint32_t state2 = kMatchNextStates[state];
            uint32_t posStateNext = (position + lenTest) & p->pbMask;
            uint32_t curAndLenCharPrice = curAndLenPrice +
                GET_PRICE_0(p->isMatch[state2][posStateNext]) +
                LitEnc_GetPriceMatched(LIT_PROBS(position + lenTest, data[lenTest - 1]),
                    data[lenTest], data2[lenTest], p->ProbPrices);
            state2 = kLiteralNextStates[state2];
            posStateNext = (posStateNext + 1) & p->pbMask;
            nextRepMatchPrice = curAndLenCharPrice +
                GET_PRICE_1(p->isMatch[state2][posStateNext]) +
                GET_PRICE_1(p->isRep[state2]);

            /* for (; lenTest2 >= 2; lenTest2--) */
            {
              uint32_t offset = cur + lenTest + 1 + lenTest2;
              uint32_t curAndLenPrice;
              COptimal *opt;
              while (lenEnd < offset)
                p->opt[++lenEnd].price = kInfinityPrice;
              curAndLenPrice = nextRepMatchPrice + GetRepPrice(p, 0, lenTest2, state2, posStateNext);
              opt = &p->opt[offset];
              if (curAndLenPrice < opt->price)
              {
                opt->price = curAndLenPrice;
                opt->posPrev = cur + lenTest + 1;
                opt->backPrev = 0;
                opt->prev1IsChar = true;
                opt->prev2 = true;
                opt->posPrev2 = cur;
                opt->backPrev2 = curBack + LZMA_NUM_REPS;
              }
            }
          }
          offs += 2;
          if (offs == numPairs)
            break;
          curBack = matches[offs + 1];
          /* posSlot is only consumed for distances >= kNumFullDistances. */
          if (curBack >= kNumFullDistances)
            GetPosSlot2(curBack, posSlot);
        }
      }
    }
  }
}
|
26984
|
|
|
|
|
|
|
|
|
26985
|
|
|
|
|
|
|
#define ChangePair(smallDist, bigDist) (((bigDist) >> 7) > (smallDist)) |
|
26986
|
|
|
|
|
|
|
|
|
26987
|
0
|
|
|
|
|
|
/*
 * Heuristic counterpart of GetOptimum: chooses between a length-1 step, a
 * repeated-distance match and a regular match using fixed length/distance
 * comparisons, without consulting any price table.
 *
 * p       - encoder state; p->longestMatchLength and p->numPairs may be
 *           overwritten with the matches found one position ahead.
 * backRes - out: (uint32_t)-1 when 1 is returned, a repeated-distance index
 *           (< LZMA_NUM_REPS), or distance + LZMA_NUM_REPS.
 *
 * Returns the length of the chosen step.
 */
static uint32_t GetOptimumFast(CLzmaEnc *p, uint32_t *backRes)
{
  uint32_t numAvail, mainLen, mainDist, numPairs, repIndex, repLen, i;
  const uint8_t *data;
  const uint32_t *matches;

  /* Read new matches, or reuse the ones stored by the previous call. */
  if (p->additionalOffset == 0)
    mainLen = ReadMatchDistances(p, &numPairs);
  else
  {
    mainLen = p->longestMatchLength;
    numPairs = p->numPairs;
  }

  numAvail = p->numAvail;
  /* Default result; every "return 1" below relies on this value. */
  *backRes = (uint32_t)-1;
  if (numAvail < 2)
    return 1;
  if (numAvail > LZMA_MATCH_LEN_MAX)
    numAvail = LZMA_MATCH_LEN_MAX;
  data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;

  /* Find the longest match among the stored distances p->reps[]; one that
     reaches numFastBytes is accepted on the spot. */
  repLen = repIndex = 0;
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t len;
    const uint8_t *data2 = data - (p->reps[i] + 1);
    if (data[0] != data2[0] || data[1] != data2[1])
      continue;
    for (len = 2; len < numAvail && data[len] == data2[len]; len++);
    if (len >= p->numFastBytes)
    {
      *backRes = i;
      MovePos(p, len - 1);
      return len;
    }
    if (len > repLen)
    {
      repIndex = i;
      repLen = len;
    }
  }

  matches = p->matches;
  /* A regular match of at least numFastBytes is likewise accepted directly. */
  if (mainLen >= p->numFastBytes)
  {
    *backRes = matches[numPairs - 1] + LZMA_NUM_REPS;
    MovePos(p, mainLen - 1);
    return mainLen;
  }

  mainDist = 0; /* for GCC */
  if (mainLen >= 2)
  {
    mainDist = matches[numPairs - 1];
    /* While the previous pair is exactly one byte shorter and its distance
       passes ChangePair against the current one, fall back to that pair. */
    while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1)
    {
      if (!ChangePair(matches[numPairs - 3], mainDist))
        break;
      numPairs -= 2;
      mainLen = matches[numPairs - 2];
      mainDist = matches[numPairs - 1];
    }
    /* A two-byte match at distance >= 0x80 is rejected (treated as no match). */
    if (mainLen == 2 && mainDist >= 0x80)
      mainLen = 1;
  }

  /* Prefer the repeated-distance match when it is at most 1 byte shorter
     than the regular match, or 2 / 3 bytes shorter when the regular match's
     distance is at least 2^9 / 2^15. */
  if (repLen >= 2 && (
        (repLen + 1 >= mainLen) ||
        (repLen + 2 >= mainLen && mainDist >= (1 << 9)) ||
        (repLen + 3 >= mainLen && mainDist >= (1 << 15))))
  {
    *backRes = repIndex;
    MovePos(p, repLen - 1);
    return repLen;
  }

  if (mainLen < 2 || numAvail <= 2)
    return 1;

  /* Look one position ahead; the result is kept in p so the next call can
     reuse it. Return 1 now when the match found there compares favourably
     with the current one. */
  p->longestMatchLength = ReadMatchDistances(p, &p->numPairs);
  if (p->longestMatchLength >= 2)
  {
    uint32_t newDistance = matches[p->numPairs - 1];
    if ((p->longestMatchLength >= mainLen && newDistance < mainDist) ||
        (p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) ||
        (p->longestMatchLength > mainLen + 1) ||
        (p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist)))
      return 1;
  }

  /* Also return 1 when, at the new current position, some repeated distance
     matches for at least mainLen - 1 bytes. */
  data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - 1;
  for (i = 0; i < LZMA_NUM_REPS; i++)
  {
    uint32_t len, limit;
    const uint8_t *data2 = data - (p->reps[i] + 1);
    if (data[0] != data2[0] || data[1] != data2[1])
      continue;
    limit = mainLen - 1;
    for (len = 2; len < limit && data[len] == data2[len]; len++);
    if (len >= limit)
      return 1;
  }
  *backRes = mainDist + LZMA_NUM_REPS;
  /* mainLen - 2 rather than mainLen - 1: ReadMatchDistances was already
     called once more above. */
  MovePos(p, mainLen - 2);
  return mainLen;
}
|
27094
|
|
|
|
|
|
|
|
|
27095
|
0
|
|
|
|
|
|
/*
 * Emits the end-of-stream marker through the range coder: a non-rep match
 * of length LZMA_MATCH_LEN_MIN whose position slot, direct bits and align
 * bits are all set to one.
 *
 * p        - encoder state; p->state is advanced via kMatchNextStates.
 * posState - position state used to select the isMatch / length contexts.
 */
static void WriteEndMarker(CLzmaEnc *p, uint32_t posState)
{
  const uint32_t markerLen = LZMA_MATCH_LEN_MIN;
  const uint32_t numDirectBits = 30 - kNumAlignBits;
  const uint32_t directBits = (((uint32_t)1 << 30) - 1) >> kNumAlignBits;

  /* "match" bit followed by "not a rep" bit, both coded in the old state. */
  RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 1);
  RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 0);
  p->state = kMatchNextStates[p->state];

  /* Length symbol (markerLen - LZMA_MATCH_LEN_MIN, i.e. zero). */
  LenEnc_Encode2(&p->lenEnc, &p->rc, markerLen - LZMA_MATCH_LEN_MIN, posState,
      !p->fastMode, p->ProbPrices);

  /* Distance: highest slot, all-ones direct bits, all-ones align bits. */
  RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(markerLen)],
      kNumPosSlotBits, (1 << kNumPosSlotBits) - 1);
  RangeEnc_EncodeDirectBits(&p->rc, directBits, numDirectBits);
  RcTree_ReverseEncode(&p->rc, p->posAlignEncoder, kNumAlignBits, kAlignMask);
}
|
27107
|
|
|
|
|
|
|
|
|
27108
|
|
|
|
|
|
|
static SRes CheckErrors(CLzmaEnc *p) |
|
27109
|
|
|
|
|
|
|
{ |
|
27110
|
0
|
0
|
|
|
|
|
if (p->result != SZ_OK) |
|
27111
|
|
|
|
|
|
|
return p->result; |
|
27112
|
0
|
0
|
|
|
|
|
if (p->rc.res != SZ_OK) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
27113
|
0
|
|
|
|
|
|
p->result = SZ_ERROR_WRITE; |
|
27114
|
0
|
0
|
|
|
|
|
if (p->matchFinderBase.result != SZ_OK) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
27115
|
0
|
|
|
|
|
|
p->result = SZ_ERROR_READ; |
|
27116
|
0
|
0
|
|
|
|
|
if (p->result != SZ_OK) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
27117
|
0
|
|
|
|
|
|
p->finished = true; |
|
27118
|
|
|
|
|
|
|
return p->result; |
|
27119
|
|
|
|
|
|
|
} |
|
27120
|
|
|
|
|
|
|
|
|
27121
|
0
|
|
|
|
|
|
static SRes Flush(CLzmaEnc *p, uint32_t nowPos) |
|
27122
|
|
|
|
|
|
|
{ |
|
27123
|
|
|
|
|
|
|
/* ReleaseMFStream(); */ |
|
27124
|
0
|
|
|
|
|
|
p->finished = true; |
|
27125
|
0
|
0
|
|
|
|
|
if (p->writeEndMark) |
|
27126
|
0
|
|
|
|
|
|
WriteEndMarker(p, nowPos & p->pbMask); |
|
27127
|
0
|
|
|
|
|
|
RangeEnc_FlushData(&p->rc); |
|
27128
|
0
|
|
|
|
|
|
RangeEnc_FlushStream(&p->rc); |
|
27129
|
0
|
|
|
|
|
|
return CheckErrors(p); |
|
27130
|
|
|
|
|
|
|
} |
|
27131
|
|
|
|
|
|
|
|
|
27132
|
0
|
|
|
|
|
|
static void FillAlignPrices(CLzmaEnc *p) |
|
27133
|
|
|
|
|
|
|
{ |
|
27134
|
|
|
|
|
|
|
uint32_t i; |
|
27135
|
0
|
0
|
|
|
|
|
for (i = 0; i < kAlignTableSize; i++) |
|
27136
|
0
|
|
|
|
|
|
p->alignPrices[i] = RcTree_ReverseGetPrice(p->posAlignEncoder, kNumAlignBits, i, p->ProbPrices); |
|
27137
|
0
|
|
|
|
|
|
p->alignPriceCount = 0; |
|
27138
|
0
|
|
|
|
|
|
} |
|
27139
|
|
|
|
|
|
|
|
|
27140
|
0
|
|
|
|
|
|
static void FillDistancesPrices(CLzmaEnc *p) |
|
27141
|
|
|
|
|
|
|
{ |
|
27142
|
|
|
|
|
|
|
uint32_t tempPrices[kNumFullDistances]; |
|
27143
|
|
|
|
|
|
|
uint32_t i, lenToPosState; |
|
27144
|
0
|
0
|
|
|
|
|
for (i = kStartPosModelIndex; i < kNumFullDistances; i++) |
|
27145
|
|
|
|
|
|
|
{ |
|
27146
|
0
|
|
|
|
|
|
uint32_t posSlot = GetPosSlot1(i); |
|
27147
|
0
|
|
|
|
|
|
uint32_t footerBits = ((posSlot >> 1) - 1); |
|
27148
|
0
|
|
|
|
|
|
uint32_t base = ((2 | (posSlot & 1)) << footerBits); |
|
27149
|
0
|
|
|
|
|
|
tempPrices[i] = RcTree_ReverseGetPrice(p->posEncoders + base - posSlot - 1, footerBits, i - base, p->ProbPrices); |
|
27150
|
|
|
|
|
|
|
} |
|
27151
|
|
|
|
|
|
|
|
|
27152
|
0
|
0
|
|
|
|
|
for (lenToPosState = 0; lenToPosState < kNumLenToPosStates; lenToPosState++) |
|
27153
|
|
|
|
|
|
|
{ |
|
27154
|
|
|
|
|
|
|
uint32_t posSlot; |
|
27155
|
0
|
|
|
|
|
|
const CLzmaProb *encoder = p->posSlotEncoder[lenToPosState]; |
|
27156
|
0
|
|
|
|
|
|
uint32_t *posSlotPrices = p->posSlotPrices[lenToPosState]; |
|
27157
|
0
|
0
|
|
|
|
|
for (posSlot = 0; posSlot < p->distTableSize; posSlot++) |
|
27158
|
0
|
|
|
|
|
|
posSlotPrices[posSlot] = RcTree_GetPrice(encoder, kNumPosSlotBits, posSlot, p->ProbPrices); |
|
27159
|
0
|
0
|
|
|
|
|
for (posSlot = kEndPosModelIndex; posSlot < p->distTableSize; posSlot++) |
|
27160
|
0
|
|
|
|
|
|
posSlotPrices[posSlot] += ((((posSlot >> 1) - 1) - kNumAlignBits) << kNumBitPriceShiftBits); |
|
27161
|
|
|
|
|
|
|
|
|
27162
|
|
|
|
|
|
|
{ |
|
27163
|
0
|
|
|
|
|
|
uint32_t *distancesPrices = p->distancesPrices[lenToPosState]; |
|
27164
|
|
|
|
|
|
|
uint32_t i; |
|
27165
|
0
|
0
|
|
|
|
|
for (i = 0; i < kStartPosModelIndex; i++) |
|
27166
|
0
|
|
|
|
|
|
distancesPrices[i] = posSlotPrices[i]; |
|
27167
|
0
|
0
|
|
|
|
|
for (; i < kNumFullDistances; i++) |
|
27168
|
0
|
|
|
|
|
|
distancesPrices[i] = posSlotPrices[GetPosSlot1(i)] + tempPrices[i]; |
|
27169
|
|
|
|
|
|
|
} |
|
27170
|
|
|
|
|
|
|
} |
|
27171
|
0
|
|
|
|
|
|
p->matchPriceCount = 0; |
|
27172
|
0
|
|
|
|
|
|
} |
|
27173
|
|
|
|
|
|
|
|
|
27174
|
0
|
|
|
|
|
|
void LzmaEnc_Construct(CLzmaEnc *p) |
|
27175
|
|
|
|
|
|
|
{ |
|
27176
|
|
|
|
|
|
|
RangeEnc_Construct(&p->rc); |
|
27177
|
|
|
|
|
|
|
MatchFinder_Construct(&p->matchFinderBase); |
|
27178
|
|
|
|
|
|
|
|
|
27179
|
|
|
|
|
|
|
{ |
|
27180
|
|
|
|
|
|
|
CLzmaEncProps props; |
|
27181
|
|
|
|
|
|
|
LzmaEncProps_Init(&props); |
|
27182
|
0
|
|
|
|
|
|
LzmaEnc_SetProps(p, &props); |
|
27183
|
|
|
|
|
|
|
} |
|
27184
|
|
|
|
|
|
|
|
|
27185
|
|
|
|
|
|
|
#ifndef LZMA_LOG_BSR |
|
27186
|
0
|
|
|
|
|
|
LzmaEnc_FastPosInit(p->g_FastPos); |
|
27187
|
|
|
|
|
|
|
#endif |
|
27188
|
|
|
|
|
|
|
|
|
27189
|
0
|
|
|
|
|
|
LzmaEnc_InitPriceTables(p->ProbPrices); |
|
27190
|
0
|
|
|
|
|
|
p->litProbs = 0; |
|
27191
|
0
|
|
|
|
|
|
p->saveState.litProbs = 0; |
|
27192
|
0
|
|
|
|
|
|
} |
|
27193
|
|
|
|
|
|
|
|
|
27194
|
0
|
|
|
|
|
|
CLzmaEncHandle LzmaEnc_Create(ISzAlloc *alloc) |
|
27195
|
|
|
|
|
|
|
{ |
|
27196
|
|
|
|
|
|
|
void *p; |
|
27197
|
0
|
|
|
|
|
|
p = alloc->Alloc(alloc, sizeof(CLzmaEnc)); |
|
27198
|
0
|
0
|
|
|
|
|
if (p != 0) |
|
27199
|
0
|
|
|
|
|
|
LzmaEnc_Construct((CLzmaEnc *)p); |
|
27200
|
0
|
|
|
|
|
|
return p; |
|
27201
|
|
|
|
|
|
|
} |
|
27202
|
|
|
|
|
|
|
|
|
27203
|
0
|
|
|
|
|
|
void LzmaEnc_FreeLits(CLzmaEnc *p, ISzAlloc *alloc) |
|
27204
|
|
|
|
|
|
|
{ |
|
27205
|
0
|
|
|
|
|
|
alloc->Free(alloc, p->litProbs); |
|
27206
|
0
|
|
|
|
|
|
alloc->Free(alloc, p->saveState.litProbs); |
|
27207
|
0
|
|
|
|
|
|
p->litProbs = 0; |
|
27208
|
0
|
|
|
|
|
|
p->saveState.litProbs = 0; |
|
27209
|
0
|
|
|
|
|
|
} |
|
27210
|
|
|
|
|
|
|
|
|
27211
|
0
|
|
|
|
|
|
void LzmaEnc_Destruct(CLzmaEnc *p, ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27212
|
|
|
|
|
|
|
{ |
|
27213
|
|
|
|
|
|
|
MatchFinder_Free(&p->matchFinderBase, allocBig); |
|
27214
|
|
|
|
|
|
|
LzmaEnc_FreeLits(p, alloc); |
|
27215
|
|
|
|
|
|
|
RangeEnc_Free(&p->rc, alloc); |
|
27216
|
0
|
|
|
|
|
|
} |
|
27217
|
|
|
|
|
|
|
|
|
27218
|
0
|
|
|
|
|
|
void LzmaEnc_Destroy(CLzmaEncHandle p, ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27219
|
|
|
|
|
|
|
{ |
|
27220
|
0
|
|
|
|
|
|
LzmaEnc_Destruct((CLzmaEnc *)p, alloc, allocBig); |
|
27221
|
0
|
|
|
|
|
|
alloc->Free(alloc, p); |
|
27222
|
0
|
|
|
|
|
|
} |
|
27223
|
|
|
|
|
|
|
|
|
27224
|
0
|
|
|
|
|
|
static SRes LzmaEnc_CodeOneBlock(CLzmaEnc *p, bool useLimits, uint32_t maxPackSize, uint32_t maxUnpackSize) |
|
27225
|
|
|
|
|
|
|
{ |
|
27226
|
|
|
|
|
|
|
uint32_t nowPos32, startPos32; |
|
27227
|
0
|
0
|
|
|
|
|
if (p->needInit) |
|
27228
|
|
|
|
|
|
|
{ |
|
27229
|
0
|
|
|
|
|
|
p->matchFinder.Init(p->matchFinderObj); |
|
27230
|
0
|
|
|
|
|
|
p->needInit = 0; |
|
27231
|
|
|
|
|
|
|
} |
|
27232
|
|
|
|
|
|
|
|
|
27233
|
0
|
0
|
|
|
|
|
if (p->finished) |
|
27234
|
0
|
|
|
|
|
|
return p->result; |
|
27235
|
0
|
0
|
|
|
|
|
RINOK(CheckErrors(p)); |
|
27236
|
|
|
|
|
|
|
|
|
27237
|
0
|
|
|
|
|
|
nowPos32 = (uint32_t)p->nowPos64; |
|
27238
|
|
|
|
|
|
|
startPos32 = nowPos32; |
|
27239
|
|
|
|
|
|
|
|
|
27240
|
0
|
0
|
|
|
|
|
if (p->nowPos64 == 0) |
|
27241
|
|
|
|
|
|
|
{ |
|
27242
|
|
|
|
|
|
|
uint32_t numPairs; |
|
27243
|
|
|
|
|
|
|
uint8_t curByte; |
|
27244
|
0
|
0
|
|
|
|
|
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0) |
|
27245
|
0
|
|
|
|
|
|
return Flush(p, nowPos32); |
|
27246
|
0
|
|
|
|
|
|
ReadMatchDistances(p, &numPairs); |
|
27247
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][0], 0); |
|
27248
|
0
|
|
|
|
|
|
p->state = kLiteralNextStates[p->state]; |
|
27249
|
0
|
|
|
|
|
|
curByte = p->matchFinder.GetIndexByte(p->matchFinderObj, 0 - p->additionalOffset); |
|
27250
|
0
|
|
|
|
|
|
LitEnc_Encode(&p->rc, p->litProbs, curByte); |
|
27251
|
0
|
|
|
|
|
|
p->additionalOffset--; |
|
27252
|
0
|
|
|
|
|
|
nowPos32++; |
|
27253
|
|
|
|
|
|
|
} |
|
27254
|
|
|
|
|
|
|
|
|
27255
|
0
|
0
|
|
|
|
|
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) != 0) |
|
27256
|
0
|
|
|
|
|
|
for (;;) |
|
27257
|
|
|
|
|
|
|
{ |
|
27258
|
|
|
|
|
|
|
uint32_t pos, len, posState; |
|
27259
|
|
|
|
|
|
|
|
|
27260
|
0
|
0
|
|
|
|
|
if (p->fastMode) |
|
27261
|
0
|
|
|
|
|
|
len = GetOptimumFast(p, &pos); |
|
27262
|
|
|
|
|
|
|
else |
|
27263
|
0
|
|
|
|
|
|
len = GetOptimum(p, nowPos32, &pos); |
|
27264
|
|
|
|
|
|
|
|
|
27265
|
0
|
|
|
|
|
|
posState = nowPos32 & p->pbMask; |
|
27266
|
0
|
0
|
|
|
|
|
if (len == 1 && pos == (uint32_t)-1) |
|
|
|
0
|
|
|
|
|
|
|
27267
|
|
|
|
|
|
|
{ |
|
27268
|
|
|
|
|
|
|
uint8_t curByte; |
|
27269
|
|
|
|
|
|
|
CLzmaProb *probs; |
|
27270
|
|
|
|
|
|
|
const uint8_t *data; |
|
27271
|
|
|
|
|
|
|
|
|
27272
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 0); |
|
27273
|
0
|
|
|
|
|
|
data = p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset; |
|
27274
|
0
|
|
|
|
|
|
curByte = *data; |
|
27275
|
0
|
|
|
|
|
|
probs = LIT_PROBS(nowPos32, *(data - 1)); |
|
27276
|
0
|
0
|
|
|
|
|
if (IsCharState(p->state)) |
|
27277
|
0
|
|
|
|
|
|
LitEnc_Encode(&p->rc, probs, curByte); |
|
27278
|
|
|
|
|
|
|
else |
|
27279
|
0
|
|
|
|
|
|
LitEnc_EncodeMatched(&p->rc, probs, curByte, *(data - p->reps[0] - 1)); |
|
27280
|
0
|
|
|
|
|
|
p->state = kLiteralNextStates[p->state]; |
|
27281
|
|
|
|
|
|
|
} |
|
27282
|
|
|
|
|
|
|
else |
|
27283
|
|
|
|
|
|
|
{ |
|
27284
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isMatch[p->state][posState], 1); |
|
27285
|
0
|
0
|
|
|
|
|
if (pos < LZMA_NUM_REPS) |
|
27286
|
|
|
|
|
|
|
{ |
|
27287
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 1); |
|
27288
|
0
|
0
|
|
|
|
|
if (pos == 0) |
|
27289
|
|
|
|
|
|
|
{ |
|
27290
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRepG0[p->state], 0); |
|
27291
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRep0Long[p->state][posState], ((len == 1) ? 0 : 1)); |
|
27292
|
|
|
|
|
|
|
} |
|
27293
|
|
|
|
|
|
|
else |
|
27294
|
|
|
|
|
|
|
{ |
|
27295
|
0
|
|
|
|
|
|
uint32_t distance = p->reps[pos]; |
|
27296
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRepG0[p->state], 1); |
|
27297
|
0
|
0
|
|
|
|
|
if (pos == 1) |
|
27298
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRepG1[p->state], 0); |
|
27299
|
|
|
|
|
|
|
else |
|
27300
|
|
|
|
|
|
|
{ |
|
27301
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRepG1[p->state], 1); |
|
27302
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRepG2[p->state], pos - 2); |
|
27303
|
0
|
0
|
|
|
|
|
if (pos == 3) |
|
27304
|
0
|
|
|
|
|
|
p->reps[3] = p->reps[2]; |
|
27305
|
0
|
|
|
|
|
|
p->reps[2] = p->reps[1]; |
|
27306
|
|
|
|
|
|
|
} |
|
27307
|
0
|
|
|
|
|
|
p->reps[1] = p->reps[0]; |
|
27308
|
0
|
|
|
|
|
|
p->reps[0] = distance; |
|
27309
|
|
|
|
|
|
|
} |
|
27310
|
0
|
0
|
|
|
|
|
if (len == 1) |
|
27311
|
0
|
|
|
|
|
|
p->state = kShortRepNextStates[p->state]; |
|
27312
|
|
|
|
|
|
|
else |
|
27313
|
|
|
|
|
|
|
{ |
|
27314
|
0
|
|
|
|
|
|
LenEnc_Encode2(&p->repLenEnc, &p->rc, len - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices); |
|
27315
|
0
|
|
|
|
|
|
p->state = kRepNextStates[p->state]; |
|
27316
|
|
|
|
|
|
|
} |
|
27317
|
|
|
|
|
|
|
} |
|
27318
|
|
|
|
|
|
|
else |
|
27319
|
|
|
|
|
|
|
{ |
|
27320
|
|
|
|
|
|
|
uint32_t posSlot; |
|
27321
|
0
|
|
|
|
|
|
RangeEnc_EncodeBit(&p->rc, &p->isRep[p->state], 0); |
|
27322
|
0
|
|
|
|
|
|
p->state = kMatchNextStates[p->state]; |
|
27323
|
0
|
|
|
|
|
|
LenEnc_Encode2(&p->lenEnc, &p->rc, len - LZMA_MATCH_LEN_MIN, posState, !p->fastMode, p->ProbPrices); |
|
27324
|
0
|
|
|
|
|
|
pos -= LZMA_NUM_REPS; |
|
27325
|
0
|
0
|
|
|
|
|
GetPosSlot(pos, posSlot); |
|
27326
|
0
|
0
|
|
|
|
|
RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], kNumPosSlotBits, posSlot); |
|
27327
|
|
|
|
|
|
|
|
|
27328
|
0
|
0
|
|
|
|
|
if (posSlot >= kStartPosModelIndex) |
|
27329
|
|
|
|
|
|
|
{ |
|
27330
|
0
|
|
|
|
|
|
uint32_t footerBits = ((posSlot >> 1) - 1); |
|
27331
|
0
|
|
|
|
|
|
uint32_t base = ((2 | (posSlot & 1)) << footerBits); |
|
27332
|
0
|
|
|
|
|
|
uint32_t posReduced = pos - base; |
|
27333
|
|
|
|
|
|
|
|
|
27334
|
0
|
0
|
|
|
|
|
if (posSlot < kEndPosModelIndex) |
|
27335
|
0
|
|
|
|
|
|
RcTree_ReverseEncode(&p->rc, p->posEncoders + base - posSlot - 1, footerBits, posReduced); |
|
27336
|
|
|
|
|
|
|
else |
|
27337
|
|
|
|
|
|
|
{ |
|
27338
|
0
|
|
|
|
|
|
RangeEnc_EncodeDirectBits(&p->rc, posReduced >> kNumAlignBits, footerBits - kNumAlignBits); |
|
27339
|
0
|
|
|
|
|
|
RcTree_ReverseEncode(&p->rc, p->posAlignEncoder, kNumAlignBits, posReduced & kAlignMask); |
|
27340
|
0
|
|
|
|
|
|
p->alignPriceCount++; |
|
27341
|
|
|
|
|
|
|
} |
|
27342
|
|
|
|
|
|
|
} |
|
27343
|
0
|
|
|
|
|
|
p->reps[3] = p->reps[2]; |
|
27344
|
0
|
|
|
|
|
|
p->reps[2] = p->reps[1]; |
|
27345
|
0
|
|
|
|
|
|
p->reps[1] = p->reps[0]; |
|
27346
|
0
|
|
|
|
|
|
p->reps[0] = pos; |
|
27347
|
0
|
|
|
|
|
|
p->matchPriceCount++; |
|
27348
|
|
|
|
|
|
|
} |
|
27349
|
|
|
|
|
|
|
} |
|
27350
|
0
|
|
|
|
|
|
p->additionalOffset -= len; |
|
27351
|
0
|
|
|
|
|
|
nowPos32 += len; |
|
27352
|
0
|
0
|
|
|
|
|
if (p->additionalOffset == 0) |
|
27353
|
|
|
|
|
|
|
{ |
|
27354
|
|
|
|
|
|
|
uint32_t processed; |
|
27355
|
0
|
0
|
|
|
|
|
if (!p->fastMode) |
|
27356
|
|
|
|
|
|
|
{ |
|
27357
|
0
|
0
|
|
|
|
|
if (p->matchPriceCount >= (1 << 7)) |
|
27358
|
0
|
|
|
|
|
|
FillDistancesPrices(p); |
|
27359
|
0
|
0
|
|
|
|
|
if (p->alignPriceCount >= kAlignTableSize) |
|
27360
|
0
|
|
|
|
|
|
FillAlignPrices(p); |
|
27361
|
|
|
|
|
|
|
} |
|
27362
|
0
|
0
|
|
|
|
|
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0) |
|
27363
|
|
|
|
|
|
|
break; |
|
27364
|
0
|
|
|
|
|
|
processed = nowPos32 - startPos32; |
|
27365
|
0
|
0
|
|
|
|
|
if (useLimits) |
|
27366
|
|
|
|
|
|
|
{ |
|
27367
|
0
|
0
|
|
|
|
|
if (processed + kNumOpts + 300 >= maxUnpackSize || |
|
|
|
0
|
|
|
|
|
|
|
27368
|
0
|
|
|
|
|
|
RangeEnc_GetProcessed(&p->rc) + kNumOpts * 2 >= maxPackSize) |
|
27369
|
|
|
|
|
|
|
break; |
|
27370
|
|
|
|
|
|
|
} |
|
27371
|
0
|
0
|
|
|
|
|
else if (processed >= (1 << 15)) |
|
27372
|
|
|
|
|
|
|
{ |
|
27373
|
0
|
|
|
|
|
|
p->nowPos64 += nowPos32 - startPos32; |
|
27374
|
0
|
|
|
|
|
|
return CheckErrors(p); |
|
27375
|
|
|
|
|
|
|
} |
|
27376
|
|
|
|
|
|
|
} |
|
27377
|
|
|
|
|
|
|
} |
|
27378
|
0
|
|
|
|
|
|
p->nowPos64 += nowPos32 - startPos32; |
|
27379
|
0
|
|
|
|
|
|
return Flush(p, nowPos32); |
|
27380
|
|
|
|
|
|
|
} |
|
27381
|
|
|
|
|
|
|
|
|
27382
|
|
|
|
|
|
|
#define kBigHashDicLimit ((uint32_t)1 << 24) |
|
27383
|
|
|
|
|
|
|
|
|
27384
|
0
|
|
|
|
|
|
static SRes LzmaEnc_Alloc(CLzmaEnc *p, uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27385
|
|
|
|
|
|
|
{ |
|
27386
|
|
|
|
|
|
|
uint32_t beforeSize = kNumOpts; |
|
27387
|
0
|
0
|
|
|
|
|
if (!RangeEnc_Alloc(&p->rc, alloc)) |
|
27388
|
|
|
|
|
|
|
return SZ_ERROR_MEM; |
|
27389
|
|
|
|
|
|
|
|
|
27390
|
|
|
|
|
|
|
{ |
|
27391
|
0
|
|
|
|
|
|
unsigned lclp = p->lc + p->lp; |
|
27392
|
0
|
0
|
|
|
|
|
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
27393
|
|
|
|
|
|
|
{ |
|
27394
|
|
|
|
|
|
|
LzmaEnc_FreeLits(p, alloc); |
|
27395
|
0
|
|
|
|
|
|
p->litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb)); |
|
27396
|
0
|
|
|
|
|
|
p->saveState.litProbs = (CLzmaProb *)alloc->Alloc(alloc, (0x300 << lclp) * sizeof(CLzmaProb)); |
|
27397
|
0
|
0
|
|
|
|
|
if (p->litProbs == 0 || p->saveState.litProbs == 0) |
|
|
|
0
|
|
|
|
|
|
|
27398
|
|
|
|
|
|
|
{ |
|
27399
|
|
|
|
|
|
|
LzmaEnc_FreeLits(p, alloc); |
|
27400
|
0
|
|
|
|
|
|
return SZ_ERROR_MEM; |
|
27401
|
|
|
|
|
|
|
} |
|
27402
|
0
|
|
|
|
|
|
p->lclp = lclp; |
|
27403
|
|
|
|
|
|
|
} |
|
27404
|
|
|
|
|
|
|
} |
|
27405
|
|
|
|
|
|
|
|
|
27406
|
0
|
|
|
|
|
|
p->matchFinderBase.bigHash = (p->dictSize > kBigHashDicLimit); |
|
27407
|
|
|
|
|
|
|
|
|
27408
|
0
|
0
|
|
|
|
|
if (beforeSize + p->dictSize < keepWindowSize) |
|
27409
|
0
|
|
|
|
|
|
beforeSize = keepWindowSize - p->dictSize; |
|
27410
|
|
|
|
|
|
|
|
|
27411
|
|
|
|
|
|
|
{ |
|
27412
|
0
|
0
|
|
|
|
|
if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) |
|
27413
|
|
|
|
|
|
|
return SZ_ERROR_MEM; |
|
27414
|
0
|
|
|
|
|
|
p->matchFinderObj = &p->matchFinderBase; |
|
27415
|
|
|
|
|
|
|
MatchFinder_CreateVTable(&p->matchFinderBase, &p->matchFinder); |
|
27416
|
|
|
|
|
|
|
} |
|
27417
|
|
|
|
|
|
|
return SZ_OK; |
|
27418
|
|
|
|
|
|
|
} |
|
27419
|
|
|
|
|
|
|
|
|
27420
|
0
|
|
|
|
|
|
void LzmaEnc_Init(CLzmaEnc *p) |
|
27421
|
|
|
|
|
|
|
{ |
|
27422
|
|
|
|
|
|
|
uint32_t i; |
|
27423
|
0
|
|
|
|
|
|
p->state = 0; |
|
27424
|
0
|
0
|
|
|
|
|
for (i = 0 ; i < LZMA_NUM_REPS; i++) |
|
27425
|
0
|
|
|
|
|
|
p->reps[i] = 0; |
|
27426
|
|
|
|
|
|
|
|
|
27427
|
|
|
|
|
|
|
RangeEnc_Init(&p->rc); |
|
27428
|
|
|
|
|
|
|
|
|
27429
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumStates; i++) |
|
27430
|
|
|
|
|
|
|
{ |
|
27431
|
|
|
|
|
|
|
uint32_t j; |
|
27432
|
0
|
0
|
|
|
|
|
for (j = 0; j < LZMA_NUM_PB_STATES_MAX; j++) |
|
27433
|
|
|
|
|
|
|
{ |
|
27434
|
0
|
|
|
|
|
|
p->isMatch[i][j] = kProbInitValue; |
|
27435
|
0
|
|
|
|
|
|
p->isRep0Long[i][j] = kProbInitValue; |
|
27436
|
|
|
|
|
|
|
} |
|
27437
|
0
|
|
|
|
|
|
p->isRep[i] = kProbInitValue; |
|
27438
|
0
|
|
|
|
|
|
p->isRepG0[i] = kProbInitValue; |
|
27439
|
0
|
|
|
|
|
|
p->isRepG1[i] = kProbInitValue; |
|
27440
|
0
|
|
|
|
|
|
p->isRepG2[i] = kProbInitValue; |
|
27441
|
|
|
|
|
|
|
} |
|
27442
|
|
|
|
|
|
|
|
|
27443
|
|
|
|
|
|
|
{ |
|
27444
|
0
|
|
|
|
|
|
uint32_t num = 0x300 << (p->lp + p->lc); |
|
27445
|
0
|
0
|
|
|
|
|
for (i = 0; i < num; i++) |
|
27446
|
0
|
|
|
|
|
|
p->litProbs[i] = kProbInitValue; |
|
27447
|
|
|
|
|
|
|
} |
|
27448
|
|
|
|
|
|
|
|
|
27449
|
|
|
|
|
|
|
{ |
|
27450
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumLenToPosStates; i++) |
|
27451
|
|
|
|
|
|
|
{ |
|
27452
|
0
|
|
|
|
|
|
CLzmaProb *probs = p->posSlotEncoder[i]; |
|
27453
|
|
|
|
|
|
|
uint32_t j; |
|
27454
|
0
|
0
|
|
|
|
|
for (j = 0; j < (1 << kNumPosSlotBits); j++) |
|
27455
|
0
|
|
|
|
|
|
probs[j] = kProbInitValue; |
|
27456
|
|
|
|
|
|
|
} |
|
27457
|
|
|
|
|
|
|
} |
|
27458
|
|
|
|
|
|
|
{ |
|
27459
|
0
|
0
|
|
|
|
|
for (i = 0; i < kNumFullDistances - kEndPosModelIndex; i++) |
|
27460
|
0
|
|
|
|
|
|
p->posEncoders[i] = kProbInitValue; |
|
27461
|
|
|
|
|
|
|
} |
|
27462
|
|
|
|
|
|
|
|
|
27463
|
|
|
|
|
|
|
LenEnc_Init(&p->lenEnc.p); |
|
27464
|
|
|
|
|
|
|
LenEnc_Init(&p->repLenEnc.p); |
|
27465
|
|
|
|
|
|
|
|
|
27466
|
0
|
0
|
|
|
|
|
for (i = 0; i < (1 << kNumAlignBits); i++) |
|
27467
|
0
|
|
|
|
|
|
p->posAlignEncoder[i] = kProbInitValue; |
|
27468
|
|
|
|
|
|
|
|
|
27469
|
0
|
|
|
|
|
|
p->optimumEndIndex = 0; |
|
27470
|
0
|
|
|
|
|
|
p->optimumCurrentIndex = 0; |
|
27471
|
0
|
|
|
|
|
|
p->additionalOffset = 0; |
|
27472
|
|
|
|
|
|
|
|
|
27473
|
0
|
|
|
|
|
|
p->pbMask = (1 << p->pb) - 1; |
|
27474
|
0
|
|
|
|
|
|
p->lpMask = (1 << p->lp) - 1; |
|
27475
|
0
|
|
|
|
|
|
} |
|
27476
|
|
|
|
|
|
|
|
|
27477
|
0
|
|
|
|
|
|
void LzmaEnc_InitPrices(CLzmaEnc *p) |
|
27478
|
|
|
|
|
|
|
{ |
|
27479
|
0
|
0
|
|
|
|
|
if (!p->fastMode) |
|
27480
|
|
|
|
|
|
|
{ |
|
27481
|
0
|
|
|
|
|
|
FillDistancesPrices(p); |
|
27482
|
0
|
|
|
|
|
|
FillAlignPrices(p); |
|
27483
|
|
|
|
|
|
|
} |
|
27484
|
|
|
|
|
|
|
|
|
27485
|
|
|
|
|
|
|
p->lenEnc.tableSize = |
|
27486
|
|
|
|
|
|
|
p->repLenEnc.tableSize = |
|
27487
|
0
|
|
|
|
|
|
p->numFastBytes + 1 - LZMA_MATCH_LEN_MIN; |
|
27488
|
0
|
|
|
|
|
|
LenPriceEnc_UpdateTables(&p->lenEnc, 1 << p->pb, p->ProbPrices); |
|
27489
|
0
|
|
|
|
|
|
LenPriceEnc_UpdateTables(&p->repLenEnc, 1 << p->pb, p->ProbPrices); |
|
27490
|
0
|
|
|
|
|
|
} |
|
27491
|
|
|
|
|
|
|
|
|
27492
|
0
|
|
|
|
|
|
static SRes LzmaEnc_AllocAndInit(CLzmaEnc *p, uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27493
|
|
|
|
|
|
|
{ |
|
27494
|
|
|
|
|
|
|
uint32_t i; |
|
27495
|
0
|
0
|
|
|
|
|
for (i = 0; i < (uint32_t)kDicLogSizeMaxCompress; i++) |
|
27496
|
0
|
0
|
|
|
|
|
if (p->dictSize <= ((uint32_t)1 << i)) |
|
27497
|
|
|
|
|
|
|
break; |
|
27498
|
0
|
|
|
|
|
|
p->distTableSize = i * 2; |
|
27499
|
|
|
|
|
|
|
|
|
27500
|
0
|
|
|
|
|
|
p->finished = false; |
|
27501
|
0
|
|
|
|
|
|
p->result = SZ_OK; |
|
27502
|
0
|
0
|
|
|
|
|
RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig)); |
|
27503
|
0
|
|
|
|
|
|
LzmaEnc_Init(p); |
|
27504
|
0
|
|
|
|
|
|
LzmaEnc_InitPrices(p); |
|
27505
|
0
|
|
|
|
|
|
p->nowPos64 = 0; |
|
27506
|
0
|
|
|
|
|
|
return SZ_OK; |
|
27507
|
|
|
|
|
|
|
} |
|
27508
|
|
|
|
|
|
|
|
|
27509
|
|
|
|
|
|
|
static SRes LzmaEnc_Prepare(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, |
|
27510
|
|
|
|
|
|
|
ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27511
|
|
|
|
|
|
|
{ |
|
27512
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27513
|
0
|
|
|
|
|
|
p->matchFinderBase.stream = inStream; |
|
27514
|
0
|
|
|
|
|
|
p->needInit = 1; |
|
27515
|
0
|
|
|
|
|
|
p->rc.outStream = outStream; |
|
27516
|
0
|
|
|
|
|
|
return LzmaEnc_AllocAndInit(p, 0, alloc, allocBig); |
|
27517
|
|
|
|
|
|
|
} |
|
27518
|
|
|
|
|
|
|
|
|
27519
|
0
|
|
|
|
|
|
SRes LzmaEnc_PrepareForLzma2(CLzmaEncHandle pp, |
|
27520
|
|
|
|
|
|
|
ISeqInStream *inStream, uint32_t keepWindowSize, |
|
27521
|
|
|
|
|
|
|
ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27522
|
|
|
|
|
|
|
{ |
|
27523
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27524
|
0
|
|
|
|
|
|
p->matchFinderBase.stream = inStream; |
|
27525
|
0
|
|
|
|
|
|
p->needInit = 1; |
|
27526
|
0
|
|
|
|
|
|
return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); |
|
27527
|
|
|
|
|
|
|
} |
|
27528
|
|
|
|
|
|
|
|
|
27529
|
|
|
|
|
|
|
static void LzmaEnc_SetInputBuf(CLzmaEnc *p, const uint8_t *src, size_t srcLen) |
|
27530
|
|
|
|
|
|
|
{ |
|
27531
|
0
|
|
|
|
|
|
p->matchFinderBase.directInput = 1; |
|
27532
|
0
|
|
|
|
|
|
p->matchFinderBase.bufferBase = (uint8_t *)src; |
|
27533
|
0
|
|
|
|
|
|
p->matchFinderBase.directInputRem = srcLen; |
|
27534
|
|
|
|
|
|
|
} |
|
27535
|
|
|
|
|
|
|
|
|
27536
|
0
|
|
|
|
|
|
SRes LzmaEnc_MemPrepare(CLzmaEncHandle pp, const uint8_t *src, size_t srcLen, |
|
27537
|
|
|
|
|
|
|
uint32_t keepWindowSize, ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27538
|
|
|
|
|
|
|
{ |
|
27539
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27540
|
|
|
|
|
|
|
LzmaEnc_SetInputBuf(p, src, srcLen); |
|
27541
|
0
|
|
|
|
|
|
p->needInit = 1; |
|
27542
|
|
|
|
|
|
|
|
|
27543
|
0
|
|
|
|
|
|
return LzmaEnc_AllocAndInit(p, keepWindowSize, alloc, allocBig); |
|
27544
|
|
|
|
|
|
|
} |
|
27545
|
|
|
|
|
|
|
|
|
27546
|
0
|
|
|
|
|
|
void LzmaEnc_Finish(CLzmaEncHandle /*pp*/) |
|
27547
|
|
|
|
|
|
|
{ |
|
27548
|
0
|
|
|
|
|
|
} |
|
27549
|
|
|
|
|
|
|
|
|
27550
|
|
|
|
|
|
|
struct CSeqOutStreamBuf |
|
27551
|
|
|
|
|
|
|
{ |
|
27552
|
|
|
|
|
|
|
ISeqOutStream funcTable; |
|
27553
|
|
|
|
|
|
|
uint8_t *data; |
|
27554
|
|
|
|
|
|
|
size_t rem; |
|
27555
|
|
|
|
|
|
|
bool overflow; |
|
27556
|
|
|
|
|
|
|
}; |
|
27557
|
|
|
|
|
|
|
|
|
27558
|
0
|
|
|
|
|
|
static size_t MyWrite(void *pp, const void *data, size_t size) |
|
27559
|
|
|
|
|
|
|
{ |
|
27560
|
|
|
|
|
|
|
CSeqOutStreamBuf *p = (CSeqOutStreamBuf *)pp; |
|
27561
|
0
|
0
|
|
|
|
|
if (p->rem < size) |
|
27562
|
|
|
|
|
|
|
{ |
|
27563
|
|
|
|
|
|
|
size = p->rem; |
|
27564
|
0
|
|
|
|
|
|
p->overflow = true; |
|
27565
|
|
|
|
|
|
|
} |
|
27566
|
0
|
|
|
|
|
|
memcpy(p->data, data, size); |
|
27567
|
0
|
|
|
|
|
|
p->rem -= size; |
|
27568
|
0
|
|
|
|
|
|
p->data += size; |
|
27569
|
0
|
|
|
|
|
|
return size; |
|
27570
|
|
|
|
|
|
|
} |
|
27571
|
|
|
|
|
|
|
|
|
27572
|
0
|
|
|
|
|
|
uint32_t LzmaEnc_GetNumAvailableBytes(CLzmaEncHandle pp) |
|
27573
|
|
|
|
|
|
|
{ |
|
27574
|
|
|
|
|
|
|
const CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27575
|
0
|
|
|
|
|
|
return p->matchFinder.GetNumAvailableBytes(p->matchFinderObj); |
|
27576
|
|
|
|
|
|
|
} |
|
27577
|
|
|
|
|
|
|
|
|
27578
|
0
|
|
|
|
|
|
const uint8_t *LzmaEnc_GetCurBuf(CLzmaEncHandle pp) |
|
27579
|
|
|
|
|
|
|
{ |
|
27580
|
|
|
|
|
|
|
const CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27581
|
0
|
|
|
|
|
|
return p->matchFinder.GetPointerToCurrentPos(p->matchFinderObj) - p->additionalOffset; |
|
27582
|
|
|
|
|
|
|
} |
|
27583
|
|
|
|
|
|
|
|
|
27584
|
0
|
|
|
|
|
|
SRes LzmaEnc_CodeOneMemBlock(CLzmaEncHandle pp, bool reInit, |
|
27585
|
|
|
|
|
|
|
uint8_t *dest, size_t *destLen, uint32_t desiredPackSize, uint32_t *unpackSize) |
|
27586
|
|
|
|
|
|
|
{ |
|
27587
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27588
|
|
|
|
|
|
|
uint64_t nowPos64; |
|
27589
|
|
|
|
|
|
|
SRes res; |
|
27590
|
|
|
|
|
|
|
CSeqOutStreamBuf outStream; |
|
27591
|
|
|
|
|
|
|
|
|
27592
|
0
|
|
|
|
|
|
outStream.funcTable.Write = MyWrite; |
|
27593
|
0
|
|
|
|
|
|
outStream.data = dest; |
|
27594
|
0
|
|
|
|
|
|
outStream.rem = *destLen; |
|
27595
|
0
|
|
|
|
|
|
outStream.overflow = false; |
|
27596
|
|
|
|
|
|
|
|
|
27597
|
0
|
|
|
|
|
|
p->writeEndMark = false; |
|
27598
|
0
|
|
|
|
|
|
p->finished = false; |
|
27599
|
0
|
|
|
|
|
|
p->result = SZ_OK; |
|
27600
|
|
|
|
|
|
|
|
|
27601
|
0
|
0
|
|
|
|
|
if (reInit) |
|
27602
|
0
|
|
|
|
|
|
LzmaEnc_Init(p); |
|
27603
|
0
|
|
|
|
|
|
LzmaEnc_InitPrices(p); |
|
27604
|
0
|
|
|
|
|
|
nowPos64 = p->nowPos64; |
|
27605
|
|
|
|
|
|
|
RangeEnc_Init(&p->rc); |
|
27606
|
0
|
|
|
|
|
|
p->rc.outStream = &outStream.funcTable; |
|
27607
|
|
|
|
|
|
|
|
|
27608
|
0
|
|
|
|
|
|
res = LzmaEnc_CodeOneBlock(p, true, desiredPackSize, *unpackSize); |
|
27609
|
|
|
|
|
|
|
|
|
27610
|
0
|
|
|
|
|
|
*unpackSize = (uint32_t)(p->nowPos64 - nowPos64); |
|
27611
|
0
|
|
|
|
|
|
*destLen -= outStream.rem; |
|
27612
|
0
|
0
|
|
|
|
|
if (outStream.overflow) |
|
27613
|
|
|
|
|
|
|
return SZ_ERROR_OUTPUT_EOF; |
|
27614
|
|
|
|
|
|
|
|
|
27615
|
0
|
|
|
|
|
|
return res; |
|
27616
|
|
|
|
|
|
|
} |
|
27617
|
|
|
|
|
|
|
|
|
27618
|
0
|
|
|
|
|
|
static SRes LzmaEnc_Encode2(CLzmaEnc *p, ICompressProgress *progress) |
|
27619
|
|
|
|
|
|
|
{ |
|
27620
|
|
|
|
|
|
|
SRes res = SZ_OK; |
|
27621
|
|
|
|
|
|
|
|
|
27622
|
|
|
|
|
|
|
for (;;) |
|
27623
|
|
|
|
|
|
|
{ |
|
27624
|
0
|
|
|
|
|
|
res = LzmaEnc_CodeOneBlock(p, false, 0, 0); |
|
27625
|
0
|
0
|
|
|
|
|
if (res != SZ_OK || p->finished != 0) |
|
|
|
0
|
|
|
|
|
|
|
27626
|
|
|
|
|
|
|
break; |
|
27627
|
0
|
0
|
|
|
|
|
if (progress != 0) |
|
27628
|
|
|
|
|
|
|
{ |
|
27629
|
0
|
|
|
|
|
|
res = progress->Progress(progress, p->nowPos64, RangeEnc_GetProcessed(&p->rc)); |
|
27630
|
0
|
0
|
|
|
|
|
if (res != SZ_OK) |
|
27631
|
|
|
|
|
|
|
{ |
|
27632
|
|
|
|
|
|
|
res = SZ_ERROR_PROGRESS; |
|
27633
|
|
|
|
|
|
|
break; |
|
27634
|
|
|
|
|
|
|
} |
|
27635
|
|
|
|
|
|
|
} |
|
27636
|
|
|
|
|
|
|
} |
|
27637
|
|
|
|
|
|
|
LzmaEnc_Finish(p); |
|
27638
|
0
|
|
|
|
|
|
return res; |
|
27639
|
|
|
|
|
|
|
} |
|
27640
|
|
|
|
|
|
|
|
|
27641
|
0
|
|
|
|
|
|
SRes LzmaEnc_Encode(CLzmaEncHandle pp, ISeqOutStream *outStream, ISeqInStream *inStream, ICompressProgress *progress, |
|
27642
|
|
|
|
|
|
|
ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27643
|
|
|
|
|
|
|
{ |
|
27644
|
0
|
0
|
|
|
|
|
RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig)); |
|
27645
|
0
|
|
|
|
|
|
return LzmaEnc_Encode2((CLzmaEnc *)pp, progress); |
|
27646
|
|
|
|
|
|
|
} |
|
27647
|
|
|
|
|
|
|
|
|
27648
|
0
|
|
|
|
|
|
SRes LzmaEnc_WriteProperties(CLzmaEncHandle pp, uint8_t *props, size_t *size) |
|
27649
|
|
|
|
|
|
|
{ |
|
27650
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27651
|
|
|
|
|
|
|
int i; |
|
27652
|
0
|
|
|
|
|
|
uint32_t dictSize = p->dictSize; |
|
27653
|
0
|
0
|
|
|
|
|
if (*size < LZMA_PROPS_SIZE) |
|
27654
|
|
|
|
|
|
|
return SZ_ERROR_PARAM; |
|
27655
|
0
|
|
|
|
|
|
*size = LZMA_PROPS_SIZE; |
|
27656
|
0
|
|
|
|
|
|
props[0] = (uint8_t)((p->pb * 5 + p->lp) * 9 + p->lc); |
|
27657
|
|
|
|
|
|
|
|
|
27658
|
0
|
0
|
|
|
|
|
for (i = 11; i <= 30; i++) |
|
27659
|
|
|
|
|
|
|
{ |
|
27660
|
0
|
0
|
|
|
|
|
if (dictSize <= ((uint32_t)2 << i)) |
|
27661
|
|
|
|
|
|
|
{ |
|
27662
|
0
|
|
|
|
|
|
dictSize = (2 << i); |
|
27663
|
0
|
|
|
|
|
|
break; |
|
27664
|
|
|
|
|
|
|
} |
|
27665
|
0
|
0
|
|
|
|
|
if (dictSize <= ((uint32_t)3 << i)) |
|
27666
|
|
|
|
|
|
|
{ |
|
27667
|
0
|
|
|
|
|
|
dictSize = (3 << i); |
|
27668
|
0
|
|
|
|
|
|
break; |
|
27669
|
|
|
|
|
|
|
} |
|
27670
|
|
|
|
|
|
|
} |
|
27671
|
|
|
|
|
|
|
|
|
27672
|
0
|
0
|
|
|
|
|
for (i = 0; i < 4; i++) |
|
27673
|
0
|
|
|
|
|
|
props[1 + i] = (uint8_t)(dictSize >> (8 * i)); |
|
27674
|
|
|
|
|
|
|
return SZ_OK; |
|
27675
|
|
|
|
|
|
|
} |
|
27676
|
|
|
|
|
|
|
|
|
27677
|
0
|
|
|
|
|
|
SRes LzmaEnc_MemEncode(CLzmaEncHandle pp, uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen, |
|
27678
|
|
|
|
|
|
|
int writeEndMark, ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27679
|
|
|
|
|
|
|
{ |
|
27680
|
|
|
|
|
|
|
SRes res; |
|
27681
|
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)pp; |
|
27682
|
|
|
|
|
|
|
|
|
27683
|
|
|
|
|
|
|
CSeqOutStreamBuf outStream; |
|
27684
|
|
|
|
|
|
|
|
|
27685
|
|
|
|
|
|
|
LzmaEnc_SetInputBuf(p, src, srcLen); |
|
27686
|
|
|
|
|
|
|
|
|
27687
|
0
|
|
|
|
|
|
outStream.funcTable.Write = MyWrite; |
|
27688
|
0
|
|
|
|
|
|
outStream.data = dest; |
|
27689
|
0
|
|
|
|
|
|
outStream.rem = *destLen; |
|
27690
|
0
|
|
|
|
|
|
outStream.overflow = false; |
|
27691
|
|
|
|
|
|
|
|
|
27692
|
0
|
|
|
|
|
|
p->writeEndMark = writeEndMark; |
|
27693
|
|
|
|
|
|
|
|
|
27694
|
0
|
|
|
|
|
|
p->rc.outStream = &outStream.funcTable; |
|
27695
|
|
|
|
|
|
|
res = LzmaEnc_MemPrepare(pp, src, srcLen, 0, alloc, allocBig); |
|
27696
|
0
|
0
|
|
|
|
|
if (res == SZ_OK) |
|
27697
|
0
|
|
|
|
|
|
res = LzmaEnc_Encode2(p, progress); |
|
27698
|
|
|
|
|
|
|
|
|
27699
|
0
|
|
|
|
|
|
*destLen -= outStream.rem; |
|
27700
|
0
|
0
|
|
|
|
|
if (outStream.overflow) |
|
27701
|
|
|
|
|
|
|
return SZ_ERROR_OUTPUT_EOF; |
|
27702
|
0
|
|
|
|
|
|
return res; |
|
27703
|
|
|
|
|
|
|
} |
|
27704
|
|
|
|
|
|
|
|
|
27705
|
0
|
|
|
|
|
|
SRes LzmaEncode(uint8_t *dest, size_t *destLen, const uint8_t *src, size_t srcLen, |
|
27706
|
|
|
|
|
|
|
const CLzmaEncProps *props, uint8_t *propsEncoded, size_t *propsSize, int writeEndMark, |
|
27707
|
|
|
|
|
|
|
ICompressProgress *progress, ISzAlloc *alloc, ISzAlloc *allocBig) |
|
27708
|
|
|
|
|
|
|
{ |
|
27709
|
0
|
|
|
|
|
|
CLzmaEnc *p = (CLzmaEnc *)LzmaEnc_Create(alloc); |
|
27710
|
|
|
|
|
|
|
SRes res; |
|
27711
|
0
|
0
|
|
|
|
|
if (p == 0) |
|
27712
|
|
|
|
|
|
|
return SZ_ERROR_MEM; |
|
27713
|
|
|
|
|
|
|
|
|
27714
|
0
|
|
|
|
|
|
res = LzmaEnc_SetProps(p, props); |
|
27715
|
0
|
0
|
|
|
|
|
if (res == SZ_OK) |
|
27716
|
|
|
|
|
|
|
{ |
|
27717
|
0
|
|
|
|
|
|
res = LzmaEnc_WriteProperties(p, propsEncoded, propsSize); |
|
27718
|
0
|
0
|
|
|
|
|
if (res == SZ_OK) |
|
27719
|
|
|
|
|
|
|
res = LzmaEnc_MemEncode(p, dest, destLen, src, srcLen, |
|
27720
|
0
|
|
|
|
|
|
writeEndMark, progress, alloc, allocBig); |
|
27721
|
|
|
|
|
|
|
} |
|
27722
|
|
|
|
|
|
|
|
|
27723
|
|
|
|
|
|
|
LzmaEnc_Destroy(p, alloc, allocBig); |
|
27724
|
0
|
|
|
|
|
|
return res; |
|
27725
|
|
|
|
|
|
|
} |
|
27726
|
|
|
|
|
|
|
|
|
27727
|
|
|
|
|
|
|
} // namespace lzma |
|
27728
|
|
|
|
|
|
|
// End of LZMA compression library by Igor Pavlov |
|
27729
|
|
|
|
|
|
|
|
|
27730
|
|
|
|
|
|
|
#ifndef UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
|
27731
|
|
|
|
|
|
|
#define UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
|
27732
|
|
|
|
|
|
|
static void *LzmaAlloc(void* /*p*/, size_t size) { return new char[size]; } |
|
27733
|
|
|
|
|
|
|
static void LzmaFree(void* /*p*/, void *address) { delete[] (char*) address; } |
|
27734
|
|
|
|
|
|
|
static lzma::ISzAlloc lzmaAllocator = { LzmaAlloc, LzmaFree }; |
|
27735
|
|
|
|
|
|
|
#endif // UFAL_CPPUTILS_COMPRESSOR_LZMA_ALLOCATOR_H |
|
27736
|
|
|
|
|
|
|
|
|
27737
|
0
|
|
|
|
|
|
// Compresses the contents of `enc` with LZMA and writes them to `os`.
//
// The emitted record consists of, in order:
//   - uncompressed length (uint32_t),
//   - compressed length (uint32_t),
//   - a simple checksum derived from the two lengths (uint32_t),
//   - the encoded LZMA properties (LZMA_PROPS_SIZE bytes),
//   - the compressed payload.
// Integers are written in the byte order of the machine running the code.
//
// Returns false when compression fails, when either length does not fit
// into 32 bits, or when any write to the stream fails; true otherwise.
bool compressor::save(ostream& os, const binary_encoder& enc) {
  // Output buffer bound: twice the input plus a fixed slack.
  size_t uncompressed_size = enc.data.size(), compressed_size = 2 * enc.data.size() + 100;
  vector<unsigned char> compressed(compressed_size);

  lzma::CLzmaEncProps props;
  lzma::LzmaEncProps_Init(&props);
  unsigned char props_encoded[LZMA_PROPS_SIZE];
  size_t props_encoded_size = LZMA_PROPS_SIZE;

  // On success, compressed_size is updated to the number of bytes produced.
  auto res = lzma::LzmaEncode(compressed.data(), &compressed_size, enc.data.data(), uncompressed_size, &props, props_encoded, &props_encoded_size, 0, nullptr, &lzmaAllocator, &lzmaAllocator);
  if (res != SZ_OK) return false;

  // The header stores both lengths as 32-bit values; refuse anything larger.
  if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false;

  // Narrow to real uint32_t objects before writing. Reinterpreting the first
  // four bytes of a size_t would emit the high-order half on big-endian
  // targets where size_t is wider than 32 bits.
  const uint32_t uncompressed_len = uint32_t(uncompressed_size);
  const uint32_t compressed_len = uint32_t(compressed_size);
  const uint32_t poor_crc = uint32_t(uncompressed_size * 19991 + compressed_size * 199999991 + 1234567890);

  if (!os.write(reinterpret_cast<const char*>(&uncompressed_len), sizeof(uncompressed_len))) return false;
  if (!os.write(reinterpret_cast<const char*>(&compressed_len), sizeof(compressed_len))) return false;
  if (!os.write(reinterpret_cast<const char*>(&poor_crc), sizeof(poor_crc))) return false;
  if (!os.write(reinterpret_cast<const char*>(props_encoded), sizeof(props_encoded))) return false;
  if (!os.write(reinterpret_cast<const char*>(compressed.data()), compressed_size)) return false;

  return true;
}
|
27759
|
|
|
|
|
|
|
|
|
27760
|
|
|
|
|
|
|
} // namespace utils |
|
27761
|
|
|
|
|
|
|
|
|
27762
|
|
|
|
|
|
|
///////// |
|
27763
|
|
|
|
|
|
|
// File: version/version.cpp |
|
27764
|
|
|
|
|
|
|
///////// |
|
27765
|
|
|
|
|
|
|
|
|
27766
|
|
|
|
|
|
|
// This file is part of UDPipe . |
|
27767
|
|
|
|
|
|
|
// |
|
27768
|
|
|
|
|
|
|
// Copyright 2016 Institute of Formal and Applied Linguistics, Faculty of |
|
27769
|
|
|
|
|
|
|
// Mathematics and Physics, Charles University in Prague, Czech Republic. |
|
27770
|
|
|
|
|
|
|
// |
|
27771
|
|
|
|
|
|
|
// This Source Code Form is subject to the terms of the Mozilla Public |
|
27772
|
|
|
|
|
|
|
// License, v. 2.0. If a copy of the MPL was not distributed with this |
|
27773
|
|
|
|
|
|
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
27774
|
|
|
|
|
|
|
|
|
27775
|
|
|
|
|
|
|
// Returns current version. |
|
27776
|
0
|
|
|
|
|
|
version version::current() { |
|
27777
|
0
|
0
|
|
|
|
|
return {1, 3, 0, ""}; |
|
27778
|
|
|
|
|
|
|
} |
|
27779
|
|
|
|
|
|
|
|
|
27780
|
|
|
|
|
|
|
// Returns multi-line formated version and copyright string. |
|
27781
|
0
|
|
|
|
|
|
string version::version_and_copyright(const string& other_libraries) { |
|
27782
|
0
|
|
|
|
|
|
ostringstream info; |
|
27783
|
|
|
|
|
|
|
|
|
27784
|
|
|
|
|
|
|
auto udpipe = version::current(); |
|
27785
|
|
|
|
|
|
|
auto unilib = unilib::version::current(); |
|
27786
|
|
|
|
|
|
|
auto morphodita = morphodita::version::current(); |
|
27787
|
|
|
|
|
|
|
auto parsito = parsito::version::current(); |
|
27788
|
|
|
|
|
|
|
|
|
27789
|
0
|
|
|
|
|
|
info << "UDPipe version " << udpipe.major << '.' << udpipe.minor << '.' << udpipe.patch |
|
27790
|
0
|
0
|
|
|
|
|
<< (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease |
|
|
|
0
|
|
|
|
|
|
|
27791
|
0
|
|
|
|
|
|
<< " (using UniLib " << unilib.major << '.' << unilib.minor << '.' << unilib.patch |
|
27792
|
0
|
0
|
|
|
|
|
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
|
|
|
0
|
|
|
|
|
|
|
27793
|
0
|
|
|
|
|
|
<< ",\nMorphoDiTa " << morphodita.major << '.' << morphodita.minor << '.' << unilib.patch |
|
27794
|
0
|
0
|
|
|
|
|
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
|
|
0
|
|
|
|
|
|
|
27795
|
0
|
|
|
|
|
|
<< ", Parsito " << parsito.major << '.' << parsito.minor << '.' << unilib.patch |
|
27796
|
0
|
0
|
|
|
|
|
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
|
|
0
|
|
|
|
|
|
|
27797
|
0
|
0
|
|
|
|
|
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
|
0
|
|
|
|
|
|
|
27798
|
|
|
|
|
|
|
"Copyright 2016 by Institute of Formal and Applied Linguistics, Faculty of\n" |
|
27799
|
0
|
0
|
|
|
|
|
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
|
27800
|
|
|
|
|
|
|
|
|
27801
|
0
|
|
|
|
|
|
return info.str(); |
|
27802
|
|
|
|
|
|
|
} |
|
27803
|
|
|
|
|
|
|
|
|
27804
|
|
|
|
|
|
|
} // namespace udpipe |
|
27805
|
8
|
50
|
|
|
|
|
} // namespace ufal |
|
|
|
50
|
|
|
|
|
|