line |
true |
false |
branch |
179
|
4 |
0 |
if (bilou[bilou_type_O].probability > bilou[best].probability) best = bilou_type_O; |
180
|
2 |
2 |
if (bilou[bilou_type_U].probability > bilou[best].probability) best = bilou_type_U; |
187
|
10 |
0 |
if (prev.bilou[bilou_type_O].probability > best_LOU_prob) { |
191
|
6 |
4 |
if (prev.bilou[bilou_type_U].probability > best_LOU_prob) { |
199
|
0 |
10 |
if (prev.bilou[bilou_type_I].probability > best_BI_prob) { |
205
|
0 |
10 |
if (best_BI_prob > best_LOU_prob) { |
222
|
0 |
10 |
if (bilou[bilou_type_I].probability > bilou[best].probability) best = bilou_type_I; |
227
|
0 |
10 |
if (bilou[bilou_type_L].probability > bilou[best].probability) best = bilou_type_L; |
232
|
10 |
0 |
if (bilou[bilou_type_O].probability > bilou[best].probability) best = bilou_type_O; |
237
|
4 |
6 |
if (bilou[bilou_type_U].probability > bilou[best].probability) best = bilou_type_U; |
353
|
2 |
2 |
if (words.size() < size) words.resize(size); |
354
|
2 |
2 |
if (features.size() < size) features.resize(size); |
355
|
2 |
2 |
if (probabilities.size() < size) probabilities.resize(size); |
356
|
2 |
2 |
if (previous_stage.size() < size) previous_stage.resize(size); |
360
|
4 |
14 |
for (unsigned i = 0; i < size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < size; i++) |
365
|
14 |
4 |
for (unsigned i = 0; i < size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < size; i++) |
370
|
7 |
2 |
for (unsigned i = 0; i < size; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < size; i++) { |
380
|
4 |
0 |
if (last_bilou[bilou_type_O].probability > last_bilou[best].probability) best = bilou_type_O; |
381
|
0 |
4 |
if (last_bilou[bilou_type_U].probability > last_bilou[best].probability) best = bilou_type_U; |
385
|
10 |
4 |
for (unsigned i = size - 1; i; i--) { |
392
|
14 |
4 |
for (unsigned i = 0; i < size; i++) { |
462
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
465
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
490
|
7 |
0 |
buffer.resize(len); |
498
|
0 |
313 |
if (data + 1 > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
503
|
0 |
679 |
if (data + sizeof(uint16_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
511
|
0 |
516 |
if (data + sizeof(uint32_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
520
|
0 |
46 |
if (len == 255) len = next_4B(); |
525
|
0 |
116 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
0 |
253 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
0 |
2 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
0 |
118 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
540
|
0 |
1 |
if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder"); |
576
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
630
|
0 |
0 |
if (uint8_t(val) != val) runtime_failure("Should encode value " << val << " in one byte!"); |
654
|
0 |
0 |
if (!(str.len < 255)) add_4B(str.len); |
687
|
0 |
0 |
class network_classifier { |
|
4 |
2 |
class network_classifier { |
|
0 |
0 |
class network_classifier { |
|
0 |
0 |
class network_classifier { |
814
|
16 |
10 |
while (size) { |
|
0 |
0 |
while (size) { |
|
0 |
0 |
while (size) { |
816
|
10 |
6 |
if (unaligned_load(first + step) < val) { |
|
0 |
0 |
if (unaligned_load(first + step) < val) { |
|
0 |
0 |
if (unaligned_load(first + step) < val) { |
857
|
2 |
0 |
if (!compressor::load(is, data)) return false; |
|
2 |
0 |
if (!compressor::load(is, data)) return false; |
861
|
2 |
0 |
load_matrix(data, indices); |
862
|
2 |
0 |
missing_weight = unaligned_load(data.next(1)); |
863
|
2 |
0 |
load_matrix(data, weights); |
868
|
2 |
0 |
hidden_layer.resize(data.next_2B()); |
|
2 |
0 |
hidden_layer.resize(data.next_2B()); |
869
|
0 |
2 |
if (!hidden_layer.empty()) { |
870
|
0 |
0 |
load_matrix(data, hidden_weights[0]); |
871
|
0 |
0 |
load_matrix(data, hidden_weights[1]); |
875
|
2 |
0 |
unsigned outcomes = data.next_2B(); |
876
|
2 |
0 |
output_layer.resize(outcomes); |
877
|
2 |
0 |
output_error.resize(outcomes); |
|
0 |
0 |
output_error.resize(outcomes); |
888
|
330 |
2 |
for (auto&& row : m) { |
|
330 |
2 |
for (auto&& row : m) { |
898
|
0 |
0 |
if (features <= 0) { if (verbose) cerr << "There must be more than zero features!" << endl; return false; } |
|
0 |
0 |
if (features <= 0) { if (verbose) cerr << "There must be more than zero features!" << endl; return false; } |
899
|
0 |
0 |
if (outcomes <= 0) { if (verbose) cerr << "There must be more than zero features!" << endl; return false; } |
|
0 |
0 |
if (outcomes <= 0) { if (verbose) cerr << "There must be more than zero features!" << endl; return false; } |
900
|
0 |
0 |
if (train.empty()) { if (verbose) cerr << "No training data!" << endl; return false; } |
|
0 |
0 |
if (train.empty()) { if (verbose) cerr << "No training data!" << endl; return false; } |
901
|
0 |
0 |
for (auto&& instance : train) { |
902
|
0 |
0 |
if (instance.outcome >= outcomes) { if (verbose) cerr << "Training instances out of range!" << endl; return false; } |
|
0 |
0 |
if (instance.outcome >= outcomes) { if (verbose) cerr << "Training instances out of range!" << endl; return false; } |
903
|
0 |
0 |
for(auto& feature : instance.features) |
904
|
0 |
0 |
if (feature >= features) { if (verbose) cerr << "Training instances out of range!" << endl; return false; } |
|
0 |
0 |
if (feature >= features) { if (verbose) cerr << "Training instances out of range!" << endl; return false; } |
906
|
0 |
0 |
for (auto&& instance : heldout) |
907
|
0 |
0 |
for(auto& feature : instance.features) |
908
|
0 |
0 |
if (feature >= features) { if (verbose) cerr << "Heldout instances out of range!" << endl; return false; } |
|
0 |
0 |
if (feature >= features) { if (verbose) cerr << "Heldout instances out of range!" << endl; return false; } |
916
|
0 |
0 |
for (auto&& instance : train) |
917
|
0 |
0 |
for (auto&& feature : instance.features) |
920
|
0 |
0 |
for (auto&& row : indices) { |
927
|
0 |
0 |
for (auto&& row : indices) |
933
|
0 |
0 |
if (!hidden_layer.empty()) { |
937
|
0 |
0 |
for (auto&& row : hidden_weights[0]) |
938
|
0 |
0 |
for (auto&& weight : row.resize(hidden_layer.size()), row) |
942
|
0 |
0 |
for (auto&& row : hidden_weights[1]) |
943
|
0 |
0 |
for (auto&& weight : row.resize(outcomes), row) |
956
|
0 |
0 |
for (unsigned i = 0; i < train.size(); i++) |
959
|
0 |
0 |
for (int iteration = 0; iteration < parameters.iterations; iteration++) { |
960
|
0 |
0 |
if (verbose) cerr << "Iteration " << iteration + 1 << ": "; |
|
0 |
0 |
if (verbose) cerr << "Iteration " << iteration + 1 << ": "; |
|
0 |
0 |
if (verbose) cerr << "Iteration " << iteration + 1 << ": "; |
962
|
0 |
0 |
double learning_rate = parameters.final_learning_rate && parameters.iterations > 1 ? |
964
|
0 |
0 |
parameters.initial_learning_rate; |
970
|
0 |
0 |
for (auto&& train_index : permutation) { |
981
|
0 |
0 |
if (verbose) |
985
|
0 |
0 |
<< "%, "; |
988
|
0 |
0 |
if (!heldout.empty()) { |
990
|
0 |
0 |
for (auto&& instance : heldout) { |
994
|
0 |
0 |
if (verbose) cerr << "heldout acc " << heldout_correct * 100. / heldout.size() << ", "; |
|
0 |
0 |
if (verbose) cerr << "heldout acc " << heldout_correct * 100. / heldout.size() << ", "; |
996
|
0 |
0 |
if (verbose) cerr << "done." << endl; |
1002
|
1 |
13 |
if (outcomes.size() != output_layer.size()) outcomes.resize(output_layer.size()); |
1003
|
0 |
14 |
if (buffer.size() != hidden_layer.size()) buffer.resize(hidden_layer.size()); |
1010
|
0 |
0 |
propagate(features, hidden_layer, output_layer); |
|
0 |
0 |
propagate(features, hidden_layer, output_layer); |
1017
|
291 |
14 |
for (auto&& feature : features) |
1018
|
291 |
0 |
if (feature < indices.size()) |
1019
|
414 |
291 |
for (unsigned i = 0; i < indices[feature].size(); i++) |
1023
|
0 |
14 |
if (!hidden_layer.empty()) { |
1024
|
0 |
0 |
for (auto&& weight : hidden_layer) |
1028
|
0 |
0 |
for (auto&& feature : features) |
1029
|
0 |
0 |
if (feature < hidden_weights[0].size()) |
1030
|
0 |
0 |
for (unsigned i = 0; i < hidden_layer.size(); i++) { |
1035
|
0 |
0 |
for (auto&& weight : hidden_layer) |
1039
|
0 |
0 |
for (unsigned h = 0; h < hidden_layer.size(); h++) |
1040
|
0 |
0 |
for (unsigned i = 0; i < output_layer.size(); i++) |
1046
|
126 |
14 |
for (unsigned i = 0; i < output_layer.size(); sum += output_layer[i], i++) |
1049
|
126 |
14 |
for (unsigned i = 0; i < output_layer.size(); i++) |
1055
|
0 |
0 |
for (unsigned i = 1; i < output_layer.size(); i++) |
|
0 |
0 |
for (unsigned i = 1; i < output_layer.size(); i++) |
1056
|
0 |
0 |
if (output_layer[i] > output_layer[best]) |
|
0 |
0 |
if (output_layer[i] > output_layer[best]) |
1064
|
0 |
0 |
for (unsigned i = 0; i < output_error.size(); i++) |
1065
|
0 |
0 |
output_error[i] = (i == instance.outcome) - output_layer[i]; |
1068
|
0 |
0 |
for (auto&& feature : instance.features) |
1069
|
0 |
0 |
for (unsigned i = 0; i < indices[feature].size(); i++) |
1073
|
0 |
0 |
if (!hidden_layer.empty()) { |
1075
|
0 |
0 |
for (unsigned h = 0; h < hidden_layer.size(); h++) { |
1077
|
0 |
0 |
for (unsigned i = 0; i < output_layer.size(); i++) |
1083
|
0 |
0 |
for (unsigned h = 0; h < hidden_layer.size(); h++) |
1084
|
0 |
0 |
for (unsigned i = 0; i < output_layer.size(); i++) |
1088
|
0 |
0 |
for (auto&& feature : instance.features) |
1089
|
0 |
0 |
for (unsigned i = 0; i < hidden_layer.size(); i++) |
1218
|
0 |
0 |
if (str == "trivial") return id = TRIVIAL, true; |
1219
|
0 |
0 |
if (str == "external") return id = EXTERNAL, true; |
1220
|
0 |
0 |
if (str == "morphodita") return id = MORPHODITA, true; |
1310
|
4 |
95 |
if (it == map.end() && total_features) { |
|
4 |
0 |
if (it == map.end() && total_features) { |
|
0 |
99 |
if (it == map.end() && total_features) { |
1314
|
95 |
4 |
return it != map.end() ? it->second : ner_feature_unknown; |
1342
|
0 |
0 |
if (window < 0) return false; |
1343
|
0 |
0 |
if (!total_features) return false; |
1348
|
0 |
0 |
lookup(string(), total_features); // Always add an empty string to the map |
1358
|
35 |
8 |
for (unsigned i = data.next_4B(); i > 0; i--) { |
1360
|
35 |
0 |
data.next_str(key); |
1361
|
35 |
0 |
map.emplace(key, data.next_4B()); |
1373
|
0 |
0 |
for (auto&& element : map_elements) { |
1374
|
0 |
0 |
enc.add_str(element.first); |
1447
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
9 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
4 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
16 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
3 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
129 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
4 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
4 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
4 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
4 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
14 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
38 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
54 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
1451
|
4 |
0 |
if (chr < CHARS) { |
1453
|
4 |
0 |
if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8; |
1454
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8; |
1455
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
1461
|
0 |
0 |
if (chr < CHARS) { |
1463
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
1464
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8; |
1465
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8; |
1466
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
1560
|
58 |
10 |
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
1561
|
0 |
10 |
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
1562
|
10 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
1564
|
10 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
10 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1566
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
1568
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1570
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1572
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
1574
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1576
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1578
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1584
|
218 |
0 |
if (!len) return 0; |
1586
|
195 |
23 |
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
1587
|
0 |
23 |
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
1588
|
23 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
1590
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1592
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
1594
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1596
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1598
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
1600
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1602
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1604
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
1629
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
1695
|
4 |
0 |
if (chr < 0x80) str += chr; |
1696
|
0 |
0 |
else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); } |
1697
|
0 |
0 |
else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
1698
|
0 |
0 |
else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
1705
|
0 |
0 |
for (char32_t chr; (chr = decode(str)); ) |
1712
|
0 |
0 |
while (len) |
1758
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
1763
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
1769
|
0 |
0 |
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false; |
1773
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
1774
|
0 |
0 |
if (positive) { |
1775
|
0 |
0 |
if (value > (numeric_limits::max() - (str.str[0] - '0')) / 10) |
1779
|
0 |
0 |
if (value < (numeric_limits::min() + (str.str[0] - '0')) / 10) |
1787
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
1791
|
0 |
0 |
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false; |
1800
|
0 |
0 |
if (!parse_int(str, value_name, result, error)) |
|
0 |
0 |
if (!parse_int(str, value_name, result, error)) |
1935
|
0 |
0 |
if (text.empty()) return; |
1938
|
0 |
0 |
for (string::size_type next; (next = text.find(sep, index)) != string::npos; index = next + 1) |
2030
|
0 |
0 |
if (!feature_processor::parse(window, args, entities, total_features, pipeline)) return false; |
2031
|
0 |
0 |
if (args.size() < 1) return cerr << "BrownCluster requires a cluster file as the first argument!" << endl, false; |
2034
|
0 |
0 |
if (!in.is_open()) return cerr << "Cannot open Brown clusters file '" << args[0] << "'!" << endl, false; |
2037
|
0 |
0 |
substrings.emplace_back(string::npos); |
2038
|
0 |
0 |
for (unsigned i = 1; i < args.size(); i++) { |
2039
|
0 |
0 |
int len = parse_int(args[i].c_str(), "BrownCluster_prefix_length"); |
2040
|
0 |
0 |
if (len <= 0) |
2041
|
0 |
0 |
return cerr << "Wrong prefix length '" << len << "' in BrownCluster specification!" << endl, false; |
2043
|
0 |
0 |
substrings.emplace_back(len); |
2051
|
0 |
0 |
while (getline(in, line)) { |
|
0 |
0 |
while (getline(in, line)) { |
2052
|
0 |
0 |
split(line, '\t', tokens); |
2053
|
0 |
0 |
if (tokens.size() != 2) return cerr << "Wrong line '" << line << "' in Brown cluster file '" << args[0] << "'!" << endl, false; |
2057
|
0 |
0 |
if (it == cluster_map.end()) { |
2059
|
0 |
0 |
clusters.emplace_back(); |
2060
|
0 |
0 |
for (auto&& substring : substrings) |
2061
|
0 |
0 |
if (substring == string::npos || substring < cluster.size()) |
|
0 |
0 |
if (substring == string::npos || substring < cluster.size()) |
|
0 |
0 |
if (substring == string::npos || substring < cluster.size()) |
2062
|
0 |
0 |
clusters.back().emplace_back(prefixes_map.emplace(cluster.substr(0, substring), *total_features + (2*window + 1) * (int)prefixes_map.size() + window).first->second); |
|
0 |
0 |
clusters.back().emplace_back(prefixes_map.emplace(cluster.substr(0, substring), *total_features + (2*window + 1) * (int)prefixes_map.size() + window).first->second); |
2065
|
0 |
0 |
if (!map.emplace(form, it->second).second) return cerr << "Form '" << form << "' is present twice in Brown cluster file '" << args[0] << "'!" << endl, false; |
2076
|
0 |
0 |
for (auto&& cluster : clusters) { |
2078
|
0 |
0 |
for (auto&& feature : cluster) |
2087
|
0 |
0 |
for (auto&& cluster : clusters) { |
2089
|
0 |
0 |
for (auto&& feature : cluster) |
2095
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2097
|
0 |
0 |
if (it != map.end()) { |
2099
|
0 |
0 |
for (auto&& feature : cluster) |
2100
|
0 |
0 |
apply_in_window(i, feature); |
|
0 |
0 |
apply_in_window(i, feature); |
2113
|
0 |
0 |
if (window) return cerr << "CzechAddContainers cannot have non-zero window!" << endl, false; |
2121
|
0 |
0 |
for (unsigned i = 0; i < entities.size(); i++) { |
2123
|
0 |
0 |
if (entities[i].type.compare("pf") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("pf") != 0)) { |
|
0 |
0 |
if (entities[i].type.compare("pf") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("pf") != 0)) { |
|
0 |
0 |
if (entities[i].type.compare("pf") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("pf") != 0)) { |
|
0 |
0 |
if (entities[i].type.compare("pf") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("pf") != 0)) { |
|
0 |
0 |
if (entities[i].type.compare("pf") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("pf") != 0)) { |
2125
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("pf") == 0) j++; |
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("pf") == 0) j++; |
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("pf") == 0) j++; |
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("pf") == 0) j++; |
2126
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) { |
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) { |
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) { |
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) { |
2128
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) j++; |
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) j++; |
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) j++; |
|
0 |
0 |
while (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ps") == 0) j++; |
2134
|
0 |
0 |
if (entities[i].type.compare("td") == 0 && i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("tm") == 0) { |
|
0 |
0 |
if (entities[i].type.compare("td") == 0 && i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("tm") == 0) { |
|
0 |
0 |
if (entities[i].type.compare("td") == 0 && i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("tm") == 0) { |
|
0 |
0 |
if (entities[i].type.compare("td") == 0 && i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("tm") == 0) { |
|
0 |
0 |
if (entities[i].type.compare("td") == 0 && i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("tm") == 0) { |
2136
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ty") == 0) j++; |
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ty") == 0) j++; |
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ty") == 0) j++; |
|
0 |
0 |
if (j < entities.size() && entities[j].start == entities[j-1].start + entities[j-1].length && entities[j].type.compare("ty") == 0) j++; |
2140
|
0 |
0 |
if (entities[i].type.compare("tm") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("td") != 0)) |
|
0 |
0 |
if (entities[i].type.compare("tm") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("td") != 0)) |
|
0 |
0 |
if (entities[i].type.compare("tm") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("td") != 0)) |
|
0 |
0 |
if (entities[i].type.compare("tm") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("td") != 0)) |
|
0 |
0 |
if (entities[i].type.compare("tm") == 0 && (!i || entities[i-1].start + entities[i-1].length < entities[i].start || entities[i-1].type.compare("td") != 0)) |
2141
|
0 |
0 |
if (i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("ty") == 0) |
|
0 |
0 |
if (i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("ty") == 0) |
|
0 |
0 |
if (i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("ty") == 0) |
|
0 |
0 |
if (i+1 < entities.size() && entities[i+1].start == entities[i].start + entities[i].length && entities[i+1].type.compare("ty") == 0) |
2147
|
0 |
0 |
if (buffer.size() > entities.size()) entities = buffer; |
2159
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2160
|
0 |
0 |
for (unsigned pos = 0; pos + 2 < sentence.words[i].lemma_comments.size(); pos++) |
2161
|
0 |
0 |
if (sentence.words[i].lemma_comments[pos] == '_' && sentence.words[i].lemma_comments[pos+1] == ';') { |
|
0 |
0 |
if (sentence.words[i].lemma_comments[pos] == '_' && sentence.words[i].lemma_comments[pos+1] == ';') { |
|
0 |
0 |
if (sentence.words[i].lemma_comments[pos] == '_' && sentence.words[i].lemma_comments[pos+1] == ';') { |
2163
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
2173
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) |
2174
|
12 |
2 |
apply_in_window(i, lookup(sentence.words[i].form, total_features)); |
|
40 |
12 |
apply_in_window(i, lookup(sentence.words[i].form, total_features)); |
2176
|
4 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
4 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
2190
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2195
|
0 |
0 |
for (bool first = true; (chr = utf8::decode(form)); first = false) { |
2197
|
0 |
0 |
was_upper = was_upper || category & unicode::Lut; |
|
0 |
0 |
was_upper = was_upper || category & unicode::Lut; |
2198
|
0 |
0 |
was_lower = was_lower || category & unicode::Ll; |
|
0 |
0 |
was_lower = was_lower || category & unicode::Ll; |
2200
|
0 |
0 |
if (first && was_upper) apply_in_window(i, fst_cap); |
|
0 |
0 |
if (first && was_upper) apply_in_window(i, fst_cap); |
|
0 |
0 |
if (first && was_upper) apply_in_window(i, fst_cap); |
2202
|
0 |
0 |
if (was_upper && !was_lower) apply_in_window(i, all_cap); |
|
0 |
0 |
if (was_upper && !was_lower) apply_in_window(i, all_cap); |
|
0 |
0 |
if (was_upper && !was_lower) apply_in_window(i, all_cap); |
2203
|
0 |
0 |
if (was_upper && was_lower) apply_in_window(i, mixed_cap); |
|
0 |
0 |
if (was_upper && was_lower) apply_in_window(i, mixed_cap); |
|
0 |
0 |
if (was_upper && was_lower) apply_in_window(i, mixed_cap); |
2214
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2216
|
0 |
0 |
for (auto&& chr : utf8::decoder(sentence.words[i].form)) |
2217
|
0 |
0 |
utf8::append(buffer, buffer.empty() ? chr : unicode::lowercase(chr)); |
2218
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
2221
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
2234
|
0 |
0 |
if (!feature_processor::parse(window, args, entities, total_features, pipeline)) return false; |
2237
|
0 |
0 |
for (auto&& arg : args) { |
2238
|
0 |
0 |
ifstream in(path_from_utf8(arg).c_str()); |
2239
|
0 |
0 |
if (!in.is_open()) return cerr << "Cannot open gazetteers file '" << arg << "'!" << endl, false; |
2245
|
0 |
0 |
while (getline(in, line)) { |
|
0 |
0 |
while (getline(in, line)) { |
2246
|
0 |
0 |
split(line, ' ', tokens); |
2247
|
0 |
0 |
for (unsigned i = 0; i < tokens.size(); i++) |
2248
|
0 |
0 |
if (!tokens[i][0]) |
2250
|
0 |
0 |
if (tokens.size() > longest) longest = tokens.size(); |
2253
|
0 |
0 |
for (unsigned i = 0; i < tokens.size(); i++) { |
2254
|
0 |
0 |
if (i) gazetteer += ' '; |
2257
|
0 |
0 |
if (it->second == gazetteers_info.size()) gazetteers_info.emplace_back(); |
|
0 |
0 |
if (it->second == gazetteers_info.size()) gazetteers_info.emplace_back(); |
2259
|
0 |
0 |
if (i + 1 < tokens.size()) |
2262
|
0 |
0 |
if (find(info.features.begin(), info.features.end(), *total_features + window) == info.features.end()) |
2263
|
0 |
0 |
info.features.emplace_back(*total_features + window); |
2266
|
0 |
0 |
*total_features += (2*window + 1) * (longest == 0 ? 0 : longest == 1 ? U+1 : longest == 2 ? L+1 : I+1); |
|
0 |
0 |
*total_features += (2*window + 1) * (longest == 0 ? 0 : longest == 1 ? U+1 : longest == 2 ? L+1 : I+1); |
|
0 |
0 |
*total_features += (2*window + 1) * (longest == 0 ? 0 : longest == 1 ? U+1 : longest == 2 ? L+1 : I+1); |
2276
|
0 |
0 |
for (auto&& gazetteer : gazetteers_info) { |
2279
|
0 |
0 |
for (auto&& feature : gazetteer.features) |
2288
|
0 |
0 |
for (auto&& gazetteer : gazetteers_info) { |
2291
|
0 |
0 |
for (auto&& feature : gazetteer.features) |
2297
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2299
|
0 |
0 |
if (it == map.end()) continue; |
2302
|
0 |
0 |
for (auto&& feature : gazetteers_info[it->second].features) { |
2303
|
0 |
0 |
apply_in_window(i, feature + G * (2*window + 1)); |
|
0 |
0 |
apply_in_window(i, feature + G * (2*window + 1)); |
2304
|
0 |
0 |
apply_in_window(i, feature + U * (2*window + 1)); |
|
0 |
0 |
apply_in_window(i, feature + U * (2*window + 1)); |
2307
|
0 |
0 |
for (unsigned j = i + 1; gazetteers_info[it->second].prefix_of_longer && j < sentence.size; j++) { |
|
0 |
0 |
for (unsigned j = i + 1; gazetteers_info[it->second].prefix_of_longer && j < sentence.size; j++) { |
|
0 |
0 |
for (unsigned j = i + 1; gazetteers_info[it->second].prefix_of_longer && j < sentence.size; j++) { |
2308
|
0 |
0 |
if (j == i + 1) buffer.assign(sentence.words[i].raw_lemma); |
2312
|
0 |
0 |
if (it == map.end()) break; |
2315
|
0 |
0 |
for (auto&& feature : gazetteers_info[it->second].features) |
2316
|
0 |
0 |
for (unsigned g = i; g <= j; g++) { |
2317
|
0 |
0 |
apply_in_window(g, feature + G * (2*window + 1)); |
|
0 |
0 |
apply_in_window(g, feature + G * (2*window + 1)); |
2318
|
0 |
0 |
apply_in_window(g, feature + (g == i ? B : g == j ? L : I) * (2*window + 1)); |
|
0 |
0 |
apply_in_window(g, feature + (g == i ? B : g == j ? L : I) * (2*window + 1)); |
|
0 |
0 |
apply_in_window(g, feature + (g == i ? B : g == j ? L : I) * (2*window + 1)); |
|
0 |
0 |
apply_in_window(g, feature + (g == i ? B : g == j ? L : I) * (2*window + 1)); |
2339
|
0 |
0 |
if (!feature_processor::parse(window, args, entities, total_features, pipeline)) return false; |
2344
|
0 |
0 |
if (args.size() < 4) return cerr << "Not enough parameters to GazetteersEnhanced!" << endl, false; |
2345
|
0 |
0 |
if (args.size() & 1) return cerr << "Odd number of parameters to GazetteersEnhanced!" << endl, false; |
2347
|
0 |
0 |
if (args[0] == "form") match = MATCH_FORM; |
2348
|
0 |
0 |
else if (args[0] == "rawlemma") match = MATCH_RAWLEMMA; |
2349
|
0 |
0 |
else if (args[0] == "rawlemmas") match = MATCH_RAWLEMMAS; |
2352
|
0 |
0 |
if (args[1] == "embed_in_model") embed = EMBED_IN_MODEL; |
2353
|
0 |
0 |
else if (args[1] == "out_of_model") embed = OUT_OF_MODEL; |
2356
|
0 |
0 |
for (unsigned i = 2; i < args.size(); i += 2) { |
2360
|
0 |
0 |
gazetteer_metas.back().entity = args[i + 1] == "NONE" ? -1 : entities.parse(args[i + 1].c_str(), true); |
2364
|
0 |
0 |
for (entity_type i = 0; i < entities.size(); i++) |
2367
|
0 |
0 |
if (!load_gazetteer_lists(pipeline, embed == EMBED_IN_MODEL)) return false; |
2379
|
0 |
0 |
for (auto&& gazetteer_meta : gazetteer_metas) { |
2386
|
0 |
0 |
for (auto&& gazetteer_list : gazetteer_lists) { |
2388
|
0 |
0 |
for (auto&& gazetteer : gazetteer_list.gazetteers) |
2396
|
0 |
0 |
for (auto&& entity : entity_list) |
2408
|
0 |
0 |
for (auto&& gazetteer_meta : gazetteer_metas) { |
2414
|
0 |
0 |
if (embed == EMBED_IN_MODEL) { |
2416
|
0 |
0 |
for (auto&& gazetteer_list : gazetteer_lists) { |
2418
|
0 |
0 |
for (auto&& gazetteer : gazetteer_list.gazetteers) |
2429
|
0 |
0 |
for (auto&& entity : entity_list) |
2435
|
0 |
0 |
vector> features(sentence.size); |
2437
|
0 |
0 |
vector> recased_match_sources(sentence.size); |
2438
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) |
2439
|
0 |
0 |
recase_match_source(sentence.words[i], RECASE_ANY, recased_match_sources[i]); |
2441
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2445
|
0 |
0 |
for (unsigned j = i; j < sentence.size && !nodes.empty(); j++) { |
|
0 |
0 |
for (unsigned j = i; j < sentence.size && !nodes.empty(); j++) { |
|
0 |
0 |
for (unsigned j = i; j < sentence.size && !nodes.empty(); j++) { |
2447
|
0 |
0 |
for (auto&& node : nodes) |
2448
|
0 |
0 |
if (!gazetteers_trie[node].children.empty()) |
2449
|
0 |
0 |
for (auto&& match_source : recased_match_sources[j]) { |
2451
|
0 |
0 |
for (auto&& it = range.first; it != range.second; it++) |
2452
|
0 |
0 |
append_unless_exists(new_nodes, it->second); |
2455
|
0 |
0 |
hard_pre_possible = hard_pre_possible && !sentence.probabilities[j].local_filled; |
|
0 |
0 |
hard_pre_possible = hard_pre_possible && !sentence.probabilities[j].local_filled; |
2456
|
0 |
0 |
if (hard_pre_possible) |
2457
|
0 |
0 |
for (auto&& node : new_nodes) |
2458
|
0 |
0 |
if (gazetteers_trie[node].mode == HARD_PRE && |
|
0 |
0 |
if (gazetteers_trie[node].mode == HARD_PRE && |
|
0 |
0 |
if (gazetteers_trie[node].mode == HARD_PRE && |
2459
|
0 |
0 |
((j - i + 1) > hard_pre_length || node < hard_pre_node)) |
2463
|
0 |
0 |
for (auto&& node : new_nodes) |
2464
|
0 |
0 |
for (auto&& feature : gazetteers_trie[node].features) |
2465
|
0 |
0 |
for (unsigned k = i; k <= j; k++) { |
2466
|
0 |
0 |
bilou_type type = j == i ? bilou_type_U : k == i ? bilou_type_B : k == j ? bilou_type_L : bilou_type_I; |
|
0 |
0 |
bilou_type type = j == i ? bilou_type_U : k == i ? bilou_type_B : k == j ? bilou_type_L : bilou_type_I; |
|
0 |
0 |
bilou_type type = j == i ? bilou_type_U : k == i ? bilou_type_B : k == j ? bilou_type_L : bilou_type_I; |
2467
|
0 |
0 |
append_unless_exists(features[k], feature + G * (2 * window + 1)); |
2468
|
0 |
0 |
append_unless_exists(features[k], feature + type * (2 * window + 1)); |
2474
|
0 |
0 |
if (hard_pre_length) |
2475
|
0 |
0 |
for (unsigned j = i; j < i + hard_pre_length; j++) { |
2476
|
0 |
0 |
for (auto&& bilou : sentence.probabilities[j].local.bilou) { |
2481
|
0 |
0 |
j == i ? bilou_type_B : j + 1 == i + hard_pre_length ? bilou_type_L : bilou_type_I; |
|
0 |
0 |
j == i ? bilou_type_B : j + 1 == i + hard_pre_length ? bilou_type_L : bilou_type_I; |
|
0 |
0 |
j == i ? bilou_type_B : j + 1 == i + hard_pre_length ? bilou_type_L : bilou_type_I; |
2489
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) |
2490
|
0 |
0 |
for (auto&& feature : features[i]) |
2491
|
0 |
0 |
apply_in_window(i, feature); |
|
0 |
0 |
apply_in_window(i, feature); |
|
0 |
0 |
apply_in_window(i, feature); |
2497
|
0 |
0 |
vector> recased_match_sources(sentence.size); |
2498
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) |
2499
|
0 |
0 |
recase_match_source(sentence.words[i], RECASE_ANY, recased_match_sources[i]); |
2503
|
0 |
0 |
for (unsigned i = 0, e = 0; i < sentence.size; i++) { |
2504
|
0 |
0 |
while (e < entities.size() && entities[e].start == i) { |
|
0 |
0 |
while (e < entities.size() && entities[e].start == i) { |
|
0 |
0 |
while (e < entities.size() && entities[e].start == i) { |
2505
|
0 |
0 |
if (i + entities[e].length > entity_until) |
2507
|
0 |
0 |
buffer.push_back(entities[e++]); |
2510
|
0 |
0 |
if (entity_until <= i) { |
2512
|
0 |
0 |
unsigned free_until = e < entities.size() ? entities[e].start : sentence.size; |
2516
|
0 |
0 |
for (unsigned j = i; j < free_until && !nodes.empty(); j++) { |
|
0 |
0 |
for (unsigned j = i; j < free_until && !nodes.empty(); j++) { |
|
0 |
0 |
for (unsigned j = i; j < free_until && !nodes.empty(); j++) { |
2518
|
0 |
0 |
for (auto&& node : nodes) |
2519
|
0 |
0 |
if (!gazetteers_trie[node].children.empty()) |
2520
|
0 |
0 |
for (auto&& match_source : recased_match_sources[j]) { |
2522
|
0 |
0 |
for (auto&& it = range.first; it != range.second; it++) |
2523
|
0 |
0 |
append_unless_exists(new_nodes, it->second); |
2526
|
0 |
0 |
for (auto&& node : new_nodes) |
2527
|
0 |
0 |
if (gazetteers_trie[node].mode == HARD_POST && |
|
0 |
0 |
if (gazetteers_trie[node].mode == HARD_POST && |
|
0 |
0 |
if (gazetteers_trie[node].mode == HARD_POST && |
2528
|
0 |
0 |
((j - i + 1) > hard_post_length || node < hard_post_node)) |
2534
|
0 |
0 |
if (hard_post_length) { |
2535
|
0 |
0 |
buffer.emplace_back(i, hard_post_length, entity_list[gazetteers_trie[hard_post_node].entity]); |
2541
|
0 |
0 |
if (buffer.size() != entities.size()) |
2546
|
0 |
0 |
for (auto&& gazetteer_list : gazetteer_lists) |
2547
|
0 |
0 |
for (auto&& gazetteer : gazetteer_list.gazetteers) { |
2548
|
0 |
0 |
gazetteers.push_back(gazetteer); |
2549
|
0 |
0 |
if (gazetteer_types) gazetteer_types->push_back(gazetteer_list.entity); |
|
0 |
0 |
if (gazetteer_types) gazetteer_types->push_back(gazetteer_list.entity); |
2590
|
0 |
0 |
for (i = array.size(); i; i--) |
2591
|
0 |
0 |
if (array[i - 1] == value) |
2594
|
0 |
0 |
if (!i) |
2602
|
0 |
0 |
for (auto&& gazetteer_meta : gazetteer_metas) |
2603
|
0 |
0 |
for (int mode = 0; mode < MODES_TOTAL; mode++) { |
2606
|
0 |
0 |
ifstream file(path_from_utf8(file_name).c_str()); |
2607
|
0 |
0 |
if (!file.is_open()) { |
2608
|
0 |
0 |
if (mode == SOFT && files_must_exist) |
2613
|
0 |
0 |
gazetteer_lists.emplace_back(); |
2618
|
0 |
0 |
while (getline(file, line)) |
|
0 |
0 |
while (getline(file, line)) |
2619
|
0 |
0 |
if (!line.empty() && line[0] != '#') |
|
0 |
0 |
if (!line.empty() && line[0] != '#') |
|
0 |
0 |
if (!line.empty() && line[0] != '#') |
2620
|
0 |
0 |
gazetteer_lists.back().gazetteers.push_back(line); |
2625
|
0 |
0 |
vector gazetteer_tokens, gazetteer_tokens_additional, gazetteer_token(1); |
2630
|
0 |
0 |
gazetteers_trie.emplace_back(); |
2631
|
0 |
0 |
for (auto&& gazetteer_list : gazetteer_lists) |
2632
|
0 |
0 |
for (auto&& gazetteer : gazetteer_list.gazetteers) { |
2633
|
0 |
0 |
pipeline.tokenizer->set_text(gazetteer); |
2634
|
0 |
0 |
if (!pipeline.tokenizer->next_sentence(&gazetteer_tokens, nullptr)) continue; |
|
0 |
0 |
if (!pipeline.tokenizer->next_sentence(&gazetteer_tokens, nullptr)) continue; |
2635
|
0 |
0 |
while (pipeline.tokenizer->next_sentence(&gazetteer_tokens_additional, nullptr)) |
|
0 |
0 |
while (pipeline.tokenizer->next_sentence(&gazetteer_tokens_additional, nullptr)) |
2636
|
0 |
0 |
gazetteer_tokens.insert(gazetteer_tokens.end(), gazetteer_tokens_additional.begin(), gazetteer_tokens_additional.end()); |
2640
|
0 |
0 |
for (unsigned token = 0; token < gazetteer_tokens.size(); token++) { |
2641
|
0 |
0 |
if (token) prefix.push_back('\t'); |
|
0 |
0 |
if (token) prefix.push_back('\t'); |
2642
|
0 |
0 |
prefix.append(gazetteer_tokens[token].str, gazetteer_tokens[token].len); |
2644
|
0 |
0 |
if (prefix_it == gazetteer_prefixes.end()) { |
2646
|
0 |
0 |
gazetteers_trie.emplace_back(); |
2650
|
0 |
0 |
pipeline.tagger->tag(gazetteer_token, gazetteer_token_tagged); |
2651
|
0 |
0 |
recase_match_source(gazetteer_token_tagged.words[0], RECASE_NATIVE, gazetteer_recased_match_sources); |
2652
|
0 |
0 |
for (auto&& match_source : gazetteer_recased_match_sources) |
2661
|
0 |
0 |
append_unless_exists(gazetteers_trie[node].features, gazetteer_list.feature); |
2662
|
0 |
0 |
if ((gazetteer_list.mode == HARD_PRE && gazetteers_trie[node].mode != HARD_PRE) || |
|
0 |
0 |
if ((gazetteer_list.mode == HARD_PRE && gazetteers_trie[node].mode != HARD_PRE) || |
|
0 |
0 |
if ((gazetteer_list.mode == HARD_PRE && gazetteers_trie[node].mode != HARD_PRE) || |
|
0 |
0 |
if ((gazetteer_list.mode == HARD_PRE && gazetteers_trie[node].mode != HARD_PRE) || |
2663
|
0 |
0 |
(gazetteer_list.mode == HARD_POST && gazetteers_trie[node].mode == SOFT)) { |
2678
|
0 |
0 |
if (mode == TO_UPPER) |
2680
|
0 |
0 |
else if (mode == TO_LOWER) |
2682
|
0 |
0 |
else if (mode == TO_TITLE) |
2683
|
0 |
0 |
for (auto&& chr : utf8::decoder(text)) |
2684
|
0 |
0 |
utf8::append(recased.back(), recased.back().empty() ? unicode::uppercase(chr) : unicode::lowercase(chr)); |
2692
|
0 |
0 |
for (auto&& chr : utf8::decoder(word.form)) { |
2693
|
0 |
0 |
any_lower = any_lower || (unicode::category(chr) & unicode::Ll); |
|
0 |
0 |
any_lower = any_lower || (unicode::category(chr) & unicode::Ll); |
2694
|
0 |
0 |
if (first) first_uc = unicode::category(chr) & unicode::Lut; |
2700
|
0 |
0 |
for (int perform = 0; perform < TO_TOTAL; perform++) { |
2701
|
0 |
0 |
if (mode == RECASE_NATIVE) { |
2702
|
0 |
0 |
if (perform == TO_UPPER && !(first_uc && !any_lower)) continue; |
|
0 |
0 |
if (perform == TO_UPPER && !(first_uc && !any_lower)) continue; |
2703
|
0 |
0 |
if (perform == TO_TITLE && !(first_uc && any_lower)) continue; |
|
0 |
0 |
if (perform == TO_TITLE && !(first_uc && any_lower)) continue; |
2704
|
0 |
0 |
if (perform == TO_LOWER && first_uc) continue; |
2706
|
0 |
0 |
if (mode == RECASE_ANY) { |
2707
|
0 |
0 |
if (perform == TO_UPPER && !(first_uc && !any_lower)) continue; |
|
0 |
0 |
if (perform == TO_UPPER && !(first_uc && !any_lower)) continue; |
2708
|
0 |
0 |
if (perform == TO_TITLE && !first_uc) continue; |
2711
|
0 |
0 |
if (match == MATCH_FORM) |
2713
|
0 |
0 |
else if (match == MATCH_RAWLEMMA) |
2715
|
0 |
0 |
else if (match == MATCH_RAWLEMMAS) |
2716
|
0 |
0 |
for (auto&& raw_lemma : word.raw_lemmas_all) |
2721
|
0 |
3 |
const vector gazetteers_enhanced::basename_suffixes = {".txt", ".hard_pre.txt", ".hard_post.txt"}; |
|
9 |
3 |
const vector gazetteers_enhanced::basename_suffixes = {".txt", ".hard_pre.txt", ".hard_post.txt"}; |
|
0 |
0 |
const vector gazetteers_enhanced::basename_suffixes = {".txt", ".hard_pre.txt", ".hard_post.txt"}; |
2727
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) |
2728
|
14 |
0 |
apply_in_window(i, lookup(sentence.words[i].lemma_id, total_features)); |
|
46 |
14 |
apply_in_window(i, lookup(sentence.words[i].lemma_id, total_features)); |
2730
|
4 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
4 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
2745
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) { |
2750
|
14 |
0 |
for (digit = false, num = 0; *form; form++) { |
2751
|
0 |
14 |
if (*form < '0' || *form > '9') break; |
2755
|
0 |
14 |
if (digit && !*form) { |
|
0 |
0 |
if (digit && !*form) { |
2757
|
0 |
0 |
if (num < 24) apply_in_window(i, hour); |
|
0 |
0 |
if (num < 24) apply_in_window(i, hour); |
|
0 |
0 |
if (num < 24) apply_in_window(i, hour); |
2758
|
0 |
0 |
if (num < 60) apply_in_window(i, minute); |
|
0 |
0 |
if (num < 60) apply_in_window(i, minute); |
|
0 |
0 |
if (num < 60) apply_in_window(i, minute); |
2759
|
0 |
0 |
if (num >= 1 && num <= 31) apply_in_window(i, day); |
|
0 |
0 |
if (num >= 1 && num <= 31) apply_in_window(i, day); |
|
0 |
0 |
if (num >= 1 && num <= 31) apply_in_window(i, day); |
2760
|
0 |
0 |
if (num >= 1 && num <= 12) apply_in_window(i, month); |
|
0 |
0 |
if (num >= 1 && num <= 12) apply_in_window(i, month); |
|
0 |
0 |
if (num >= 1 && num <= 12) apply_in_window(i, month); |
2761
|
0 |
0 |
if (num >= 1000 && num <= 2200) apply_in_window(i, year);; |
|
0 |
0 |
if (num >= 1000 && num <= 2200) apply_in_window(i, year);; |
|
0 |
0 |
if (num >= 1000 && num <= 2200) apply_in_window(i, year);; |
2763
|
0 |
14 |
if (digit && num < 24 && (*form == '.' || *form == ':')) { |
|
0 |
0 |
if (digit && num < 24 && (*form == '.' || *form == ':')) { |
2765
|
0 |
0 |
for (digit = false, num = 0, form++; *form; form++) { |
2766
|
0 |
0 |
if (*form < '0' || *form > '9') break; |
2770
|
0 |
0 |
if (digit && !*form && num < 60) apply_in_window(i, time); |
|
0 |
0 |
if (digit && !*form && num < 60) apply_in_window(i, time); |
|
0 |
0 |
if (digit && !*form && num < 60) apply_in_window(i, time); |
|
0 |
0 |
if (digit && !*form && num < 60) apply_in_window(i, time); |
|
0 |
0 |
if (digit && !*form && num < 60) apply_in_window(i, time); |
2780
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) |
2781
|
7 |
7 |
if (sentence.previous_stage[i].bilou != bilou_type_unknown) { |
2786
|
7 |
0 |
apply_in_range(i, lookup(buffer, total_features), 1, window); |
|
9 |
7 |
apply_in_range(i, lookup(buffer, total_features), 1, window); |
2792
|
4 |
10 |
if (value < 0) { |
2796
|
11 |
14 |
for (; value; value >>= 4) |
2805
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) |
2806
|
14 |
0 |
apply_in_window(i, lookup(sentence.words[i].raw_lemma, total_features)); |
|
46 |
14 |
apply_in_window(i, lookup(sentence.words[i].raw_lemma, total_features)); |
2808
|
4 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
4 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
2822
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) { |
2827
|
54 |
14 |
for (bool first = true; (chr = utf8::decode(raw_lemma)); first = false) { |
2829
|
54 |
0 |
was_upper = was_upper || category & unicode::Lut; |
|
54 |
0 |
was_upper = was_upper || category & unicode::Lut; |
2830
|
14 |
40 |
was_lower = was_lower || category & unicode::Ll; |
|
4 |
10 |
was_lower = was_lower || category & unicode::Ll; |
2832
|
54 |
0 |
if (first && was_upper) apply_in_window(i, fst_cap); |
|
0 |
0 |
if (first && was_upper) apply_in_window(i, fst_cap); |
|
0 |
0 |
if (first && was_upper) apply_in_window(i, fst_cap); |
2834
|
0 |
14 |
if (was_upper && !was_lower) apply_in_window(i, all_cap); |
|
0 |
0 |
if (was_upper && !was_lower) apply_in_window(i, all_cap); |
|
0 |
0 |
if (was_upper && !was_lower) apply_in_window(i, all_cap); |
2835
|
0 |
14 |
if (was_upper && was_lower) apply_in_window(i, mixed_cap); |
|
0 |
0 |
if (was_upper && was_lower) apply_in_window(i, mixed_cap); |
|
0 |
0 |
if (was_upper && was_lower) apply_in_window(i, mixed_cap); |
2846
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2848
|
0 |
0 |
for (auto&& chr : utf8::decoder(sentence.words[i].raw_lemma)) |
2849
|
0 |
0 |
utf8::append(buffer, buffer.empty() ? chr : unicode::lowercase(chr)); |
2850
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
2853
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
2866
|
0 |
0 |
if (!feature_processor::parse(window, args, entities, total_features, pipeline)) return false; |
2867
|
0 |
0 |
if (args.size() != 2) return cerr << "*Suffix features require exactly two arguments -- shortest and longest suffix length!" << endl, false; |
2870
|
0 |
0 |
if (!parse_int(args[0], "*Suffix shortest length", shortest, error)) return cerr << error << endl, false; |
|
0 |
0 |
if (!parse_int(args[0], "*Suffix shortest length", shortest, error)) return cerr << error << endl, false; |
2871
|
0 |
0 |
if (!parse_int(args[1], "*Suffix longest length", longest, error)) return cerr << error << endl, false; |
|
0 |
0 |
if (!parse_int(args[1], "*Suffix longest length", longest, error)) return cerr << error << endl, false; |
2893
|
0 |
0 |
for (unsigned i = 0; i < sentence.size; i++) { |
2895
|
0 |
0 |
for (auto&& chr : utf8::decoder(source == SUFFIX_SOURCE_FORM ? sentence.words[i].form : sentence.words[i].raw_lemma)) |
|
0 |
0 |
for (auto&& chr : utf8::decoder(source == SUFFIX_SOURCE_FORM ? sentence.words[i].form : sentence.words[i].raw_lemma)) |
2896
|
0 |
0 |
chrs.push_back((casing == SUFFIX_CASE_ORIGINAL || chrs.empty()) ? chr : unicode::lowercase(chr)); |
|
0 |
0 |
chrs.push_back((casing == SUFFIX_CASE_ORIGINAL || chrs.empty()) ? chr : unicode::lowercase(chr)); |
2899
|
0 |
0 |
for (int s = 1; s <= longest && s <= int(chrs.size()); s++) { |
|
0 |
0 |
for (int s = 1; s <= longest && s <= int(chrs.size()); s++) { |
|
0 |
0 |
for (int s = 1; s <= longest && s <= int(chrs.size()); s++) { |
2900
|
0 |
0 |
utf8::append(buffer, chrs[chrs.size() - s]); |
2901
|
0 |
0 |
if (s >= shortest) { |
2902
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
|
0 |
0 |
apply_in_window(i, lookup(buffer, total_features)); |
2907
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
0 |
0 |
apply_outer_words_in_window(lookup_empty()); |
2919
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) |
2920
|
12 |
2 |
apply_in_window(i, lookup(sentence.words[i].tag, total_features)); |
|
40 |
12 |
apply_in_window(i, lookup(sentence.words[i].tag, total_features)); |
2922
|
4 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
4 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
|
8 |
0 |
apply_outer_words_in_window(lookup_empty()); |
|
12 |
8 |
apply_outer_words_in_window(lookup_empty()); |
2931
|
0 |
0 |
if (!feature_processor::parse(window, args, entities, total_features, pipeline)) return false; |
2932
|
0 |
0 |
if (args.size() != 2) return cerr << "URLEmailDetector requires exactly two arguments -- named entity types for URL and email!" << endl, false; |
2937
|
0 |
0 |
if (url == entity_type_unknown || email == entity_type_unknown) |
|
0 |
0 |
if (url == entity_type_unknown || email == entity_type_unknown) |
2957
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) { |
2959
|
0 |
14 |
if (type == url_detector::NO_URL || sentence.probabilities[i].local_filled) continue; |
|
0 |
0 |
if (type == url_detector::NO_URL || sentence.probabilities[i].local_filled) continue; |
|
0 |
14 |
if (type == url_detector::NO_URL || sentence.probabilities[i].local_filled) continue; |
2962
|
0 |
0 |
for (auto&& bilou : sentence.probabilities[i].local.bilou) { |
2967
|
0 |
0 |
sentence.probabilities[i].local.bilou[bilou_type_U].entity = type == url_detector::EMAIL ? email : url; |
2982
|
0 |
8 |
if (name.compare("BrownClusters") == 0) return new brown_clusters(); |
2983
|
0 |
8 |
if (name.compare("CzechAddContainers") == 0) return new czech_add_containers(); |
2984
|
0 |
8 |
if (name.compare("CzechLemmaTerm") == 0) return new czech_lemma_term(); |
2985
|
1 |
7 |
if (name.compare("Form") == 0) return new form(); |
2986
|
0 |
7 |
if (name.compare("FormCapitalization") == 0) return new form_capitalization(); |
2987
|
0 |
7 |
if (name.compare("FormCaseNormalized") == 0) return new form_case_normalized(); |
2988
|
0 |
7 |
if (name.compare("FormCaseNormalizedSuffix") == 0) return new suffix(SUFFIX_SOURCE_FORM, SUFFIX_CASE_NORMALIZED); |
2989
|
0 |
7 |
if (name.compare("FormSuffix") == 0) return new suffix(SUFFIX_SOURCE_FORM, SUFFIX_CASE_ORIGINAL); |
2990
|
0 |
7 |
if (name.compare("Gazetteers") == 0) return new feature_processors::gazetteers(); |
2991
|
0 |
7 |
if (name.compare("GazetteersEnhanced") == 0) return new gazetteers_enhanced(); |
2992
|
1 |
6 |
if (name.compare("Lemma") == 0) return new lemma(); |
2993
|
1 |
5 |
if (name.compare("NumericTimeValue") == 0) return new number_time_value(); |
2994
|
1 |
4 |
if (name.compare("PreviousStage") == 0) return new previous_stage(); |
2995
|
1 |
3 |
if (name.compare("RawLemma") == 0) return new raw_lemma(); |
2996
|
1 |
2 |
if (name.compare("RawLemmaCapitalization") == 0) return new raw_lemma_capitalization(); |
2997
|
0 |
2 |
if (name.compare("RawLemmaCaseNormalized") == 0) return new raw_lemma_case_normalized(); |
2998
|
0 |
2 |
if (name.compare("RawLemmaCaseNormalizedSuffix") == 0) return new suffix(SUFFIX_SOURCE_RAWLEMMA, SUFFIX_CASE_NORMALIZED); |
2999
|
0 |
2 |
if (name.compare("RawLemmaSuffix") == 0) return new suffix(SUFFIX_SOURCE_RAWLEMMA, SUFFIX_CASE_ORIGINAL); |
3000
|
1 |
1 |
if (name.compare("Tag") == 0) return new tag(); |
3001
|
1 |
0 |
if (name.compare("URLEmailDetector") == 0) return new url_email_detector(); |
3058
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
3061
|
1 |
0 |
total_features = data.next_4B(); |
3064
|
1 |
0 |
for (unsigned i = data.next_4B(); i; i--) { |
|
8 |
1 |
for (unsigned i = data.next_4B(); i; i--) { |
3066
|
8 |
0 |
data.next_str(name); |
3069
|
8 |
0 |
auto* processor = feature_processor::create(name); |
3070
|
8 |
0 |
if (processor) { |
3071
|
8 |
0 |
processor->load(data, pipeline); |
3072
|
8 |
0 |
processors.emplace_back(name, processor); |
3078
|
0 |
0 |
} |
3088
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) { |
3094
|
32 |
4 |
for (auto&& processor : processors) |
3095
|
0 |
32 |
processor.processor->process_sentence(sentence, adding_features ? &total_features : nullptr, buffer); |
3099
|
16 |
2 |
for (auto&& processor : processors) |
|
0 |
0 |
for (auto&& processor : processors) |
3108
|
0 |
0 |
for (auto&& processor : processors) |
|
0 |
0 |
for (auto&& processor : processors) |
3395
|
0 |
0 |
format_tagged_lemma(result); |
3400
|
0 |
0 |
for (auto&& lemma : lemmas) |
3403
|
0 |
0 |
if (lemmas.size() > 1) |
3411
|
0 |
0 |
if (converter) converter->convert(lemma); |
3415
|
0 |
0 |
if (converter) converter->convert_analyzed(lemmas); |
3428
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
3430
|
0 |
0 |
if (converter) converter->convert(lemma); |
3438
|
0 |
0 |
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
|
0 |
0 |
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
3447
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
0 |
0 |
if (converter) converter->convert(lemma); |
3448
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
3449
|
0 |
0 |
tagged_lemma parrent_lemma(parent.lemma, current.tag); |
3450
|
0 |
0 |
if (converter) converter->convert(parrent_lemma); |
|
0 |
0 |
if (converter) converter->convert(parrent_lemma); |
3451
|
0 |
0 |
lemma.lemma.append(" ").append(parrent_lemma.lemma); |
3460
|
0 |
0 |
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
|
0 |
0 |
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
3469
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
0 |
0 |
if (converter) converter->convert(lemma); |
3470
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
3471
|
0 |
0 |
format_tree(root, tag, lemma, converter); |
3477
|
0 |
0 |
if (converter) { |
3478
|
0 |
0 |
tagged_lemma current(root, tag); |
3479
|
0 |
0 |
converter->convert(current); |
3480
|
0 |
0 |
tree.lemma.append(" ").append(current.lemma); |
3482
|
0 |
0 |
tree.lemma.append(" ").append(root); |
3485
|
0 |
0 |
if (derinet->children(root, children)) |
|
0 |
0 |
if (derinet->children(root, children)) |
3486
|
0 |
0 |
for (auto&& child : children) |
3487
|
0 |
0 |
format_tree(child.lemma, tag, tree, converter); |
3488
|
0 |
0 |
tree.lemma.push_back(' '); |
3496
|
0 |
0 |
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
|
0 |
0 |
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
3500
|
0 |
0 |
if (name == "none") return new_none_derivation_formatter(); |
3501
|
0 |
0 |
if (name == "root") return new_root_derivation_formatter(derinet); |
3502
|
0 |
0 |
if (name == "path") return new_path_derivation_formatter(derinet); |
3503
|
0 |
0 |
if (name == "tree") return new_tree_derivation_formatter(derinet); |
3533
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
108 |
6 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
31 |
3 |
while (len--) |
|
136 |
26 |
while (len--) |
|
0 |
0 |
while (len--) |
|
118 |
10 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
3534
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
30 |
78 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
14 |
17 |
if (*a++ != *b++) |
|
124 |
12 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
98 |
20 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
3543
|
0 |
0 |
while (len--) |
|
236 |
108 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
4 |
24 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
56 |
9 |
while (len--) |
3684
|
52 |
16 |
while (mask < num) |
3686
|
16 |
0 |
hash.resize(mask + 1); |
3690
|
137 |
0 |
uint32_t size = data.next_4B(); |
3692
|
137 |
0 |
hash.resize(size); |
3693
|
137 |
0 |
memcpy(hash.data(), data.next(size), size * sizeof(uint32_t)); |
3695
|
137 |
0 |
size = data.next_4B(); |
3696
|
137 |
0 |
this->data.resize(size); |
3697
|
68 |
69 |
if (size) memcpy(this->data.data(), data.next(size), size); |
|
68 |
0 |
if (size) memcpy(this->data.data(), data.next(size), size); |
3701
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
30 |
0 |
if (len <= 0) return 0; |
|
30 |
18 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
108 |
0 |
if (len <= 0) return 0; |
|
41 |
0 |
if (len <= 0) return 0; |
|
9 |
0 |
if (len <= 0) return 0; |
|
9 |
0 |
if (len <= 0) return 0; |
3702
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
4 |
26 |
if (len == 1) return unaligned_load(data); |
|
18 |
12 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
108 |
if (len == 1) return unaligned_load(data); |
|
7 |
34 |
if (len == 1) return unaligned_load(data); |
|
0 |
9 |
if (len == 1) return unaligned_load(data); |
|
0 |
9 |
if (len == 1) return unaligned_load(data); |
3703
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
26 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
12 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
20 |
88 |
if (len == 2) return unaligned_load(data); |
|
30 |
4 |
if (len == 2) return unaligned_load(data); |
|
9 |
0 |
if (len == 2) return unaligned_load(data); |
|
9 |
0 |
if (len == 2) return unaligned_load(data); |
3706
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
142 |
26 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
60 |
20 |
while (len--) |
|
132 |
30 |
while (len--) |
|
56 |
9 |
while (len--) |
|
56 |
9 |
while (len--) |
3720
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
48 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
3726
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
48 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
3727
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
30 |
18 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
3729
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
3730
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
3741
|
108 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
41 |
55 |
if (unsigned(len) >= hashes.size()) return nullptr; |
3747
|
20 |
88 |
if (len <= 2) |
|
30 |
11 |
if (len <= 2) |
3748
|
52 |
36 |
return data != end ? (const T*)(data + len) : nullptr; |
|
11 |
0 |
return data != end ? (const T*)(data + len) : nullptr; |
3750
|
20 |
17 |
while (data < end) { |
|
38 |
4 |
while (data < end) { |
3751
|
3 |
17 |
if (small_memeq(str, data, len)) return (const T*)(data + len); |
|
26 |
12 |
if (small_memeq(str, data, len)) return (const T*)(data + len); |
3760
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
30 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
3766
|
0 |
0 |
while (data < end) { |
|
30 |
30 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
3776
|
3 |
1 |
for (unsigned len = 0; len < hashes.size(); len++) { |
|
0 |
0 |
for (unsigned len = 0; len < hashes.size(); len++) { |
|
0 |
0 |
for (unsigned len = 0; len < hashes.size(); len++) { |
3780
|
13 |
3 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
3794
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
3798
|
2 |
14 |
if (hashes.size() == 0) hashes.emplace_back(1); |
3799
|
2 |
12 |
else if (hashes.size() == 1) hashes.emplace_back(1<<8); |
3800
|
2 |
10 |
else if (hashes.size() == 2) hashes.emplace_back(1<<16); |
3805
|
9 |
0 |
if (unsigned(str_len) < hashes.size()) |
3810
|
16 |
2 |
for (auto&& hash : hashes) { |
3812
|
131616 |
16 |
for (auto&& len : hash.hash) total += len, len = total - len; |
3818
|
9 |
0 |
if (unsigned(str_len) < hashes.size()) { |
3829
|
16 |
2 |
for (auto&& hash : hashes) |
3830
|
131616 |
16 |
for (int i = hash.hash.size() - 1; i >= 0; i--) |
3831
|
131600 |
16 |
hash.hash[i] = i > 0 ? hash.hash[i-1] : 0; |
3838
|
137 |
48 |
for (unsigned i = 0; i < sizes; i++) |
3890
|
0 |
0 |
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
3897
|
0 |
0 |
if (lemma_data) { |
3899
|
0 |
0 |
if (parent_encoded) { |
3903
|
0 |
0 |
if (parent_data[parent_len]) |
3913
|
0 |
0 |
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
3920
|
0 |
0 |
if (lemma_data) { |
3923
|
0 |
0 |
if (children_len) { |
3925
|
0 |
0 |
for (unsigned i = 0; i < children_len; i++) { |
3929
|
0 |
0 |
if (child_data[child_len]) |
3941
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
3944
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
3945
|
0 |
0 |
derinet.resize(data.next_4B()); |
|
0 |
0 |
derinet.resize(data.next_4B()); |
3949
|
0 |
0 |
for (int pass = 1; pass <= 3; pass++) { |
3950
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
3953
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
3954
|
0 |
0 |
lemma.resize(lemma.size() - data.next_1B()); |
|
0 |
0 |
lemma.resize(lemma.size() - data.next_1B()); |
3955
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
3956
|
0 |
0 |
lemma.push_back(data.next_1B()); |
3958
|
0 |
0 |
unsigned char lemma_comment_len = data.next_1B(); |
3959
|
0 |
0 |
const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr; |
|
0 |
0 |
const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr; |
3961
|
0 |
0 |
unsigned children = data.next_2B(); |
3963
|
0 |
0 |
if (pass == 3) parent.clear(); |
3965
|
0 |
0 |
int operations = data.next_1B(); |
3966
|
0 |
0 |
if (operations) { |
3967
|
0 |
0 |
int remove_start = operations & REMOVE_START ? data.next_1B() : 0; |
|
0 |
0 |
int remove_start = operations & REMOVE_START ? data.next_1B() : 0; |
3968
|
0 |
0 |
int remove_end = operations & REMOVE_END ? data.next_1B() : 0; |
|
0 |
0 |
int remove_end = operations & REMOVE_END ? data.next_1B() : 0; |
3969
|
0 |
0 |
if (operations & ADD_START) { |
3970
|
0 |
0 |
int add_start = data.next_1B(); |
3971
|
0 |
0 |
const char* str = data.next(add_start); |
3972
|
0 |
0 |
if (pass == 3) parent.assign(str, str + add_start); |
3974
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end); |
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end); |
3975
|
0 |
0 |
if (operations & ADD_END) { |
3976
|
0 |
0 |
int add_end = data.next_1B(); |
3977
|
0 |
0 |
const char* str = data.next(add_end); |
3978
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), str, str + add_end); |
3982
|
0 |
0 |
if (pass == 1) { |
3984
|
0 |
0 |
} else if (pass == 2) { |
3987
|
0 |
0 |
while (lemma_comment_len--) *lemma_data++ = *lemma_comment++; |
3990
|
0 |
0 |
if (children) unaligned_store(((uint32_t*)lemma_data) + children - 1, 0); |
3991
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
4002
|
0 |
0 |
assert(lemma_data && parent_data); |
4005
|
0 |
0 |
assert(parent.size() < (1<<8) && parent_offset < (1<<24)); |
|
0 |
0 |
assert(parent.size() < (1<<8) && parent_offset < (1<<24)); |
4009
|
0 |
0 |
assert(lemma.size() < (1<<8) && lemma_offset < (1<<24)); |
|
0 |
0 |
assert(lemma.size() < (1<<8) && lemma_offset < (1<<24)); |
4014
|
0 |
0 |
if (child_index+1 < children_len) |
4019
|
0 |
0 |
if (pass == 1) |
4020
|
0 |
0 |
derinet.done_adding(); |
4021
|
0 |
0 |
if (pass == 2) |
4023
|
0 |
0 |
} |
4056
|
38 |
14 |
while (form_tmp.len && !rest_has_Lut) |
|
38 |
0 |
while (form_tmp.len && !rest_has_Lut) |
4065
|
4 |
10 |
if (first_Lut && !rest_has_Lut) { // common case allowing fast execution |
4070
|
0 |
10 |
} else if (!first_Lut && rest_has_Lut) { |
4073
|
0 |
10 |
} else if (first_Lut && rest_has_Lut) { |
4080
|
0 |
0 |
while (form_tmp.len) { |
4121
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) |
4122
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_' || |
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_' || |
4123
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
4130
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) { |
4131
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_') |
4133
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
4135
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
4145
|
0 |
0 |
if (addinfo_len) { |
4146
|
0 |
0 |
res.reserve(addinfo_len + 4); |
4147
|
0 |
0 |
if (addinfo[0] != 255) { |
4152
|
0 |
0 |
for (int i = 1; i < addinfo_len; i++) |
4160
|
0 |
0 |
for (int i = 1; i + 2 < addinfo_len; i++) |
4161
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
4171
|
0 |
0 |
if (lemma_info < lemma.str + lemma.len) { |
4175
|
0 |
0 |
if (*lemma_info == '-') { |
4178
|
0 |
0 |
lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9'); |
|
0 |
0 |
lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9'); |
4182
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
4183
|
0 |
0 |
if (die_on_failure) |
4190
|
0 |
0 |
while (lemma_additional_info < lemma.str + lemma.len) |
4193
|
0 |
0 |
if (data.size() > 255) { |
4194
|
0 |
0 |
if (die_on_failure) |
4205
|
0 |
0 |
if (data.empty()) return true; |
4206
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
4250
|
0 |
0 |
if (filters.empty()) return true; |
4253
|
0 |
0 |
for (auto&& filter : filters) { |
4255
|
0 |
0 |
while (tag_pos < filter.pos) |
4256
|
0 |
0 |
if (!tag[tag_pos++]) |
4258
|
0 |
0 |
if (!tag[tag_pos]) |
4263
|
0 |
0 |
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
|
0 |
0 |
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
4265
|
0 |
0 |
if (!matched) return false; |
4305
|
7 |
1 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
4307
|
9 |
1 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
4313
|
1 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
|
0 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
|
0 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
4315
|
2 |
1 |
for (int pass = 1; pass <= 2; pass++) { |
|
0 |
0 |
for (int pass = 1; pass <= 2; pass++) { |
|
0 |
0 |
for (int pass = 1; pass <= 2; pass++) { |
4316
|
1 |
1 |
if (pass > 1) data.seek(data_position); |
|
1 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
4321
|
2 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
4 |
2 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
4322
|
4 |
0 |
lemma_len -= data.next_1B(); |
|
0 |
0 |
lemma_len -= data.next_1B(); |
|
0 |
0 |
lemma_len -= data.next_1B(); |
4323
|
4 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
24 |
4 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
4324
|
24 |
0 |
lemma[lemma_len++] = data.next_1B(); |
|
0 |
0 |
lemma[lemma_len++] = data.next_1B(); |
|
0 |
0 |
lemma[lemma_len++] = data.next_1B(); |
4325
|
4 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
|
0 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
|
0 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
4326
|
0 |
4 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
4327
|
4 |
0 |
unsigned lemma_roots = data.next_1B(); |
|
0 |
0 |
unsigned lemma_roots = data.next_1B(); |
|
0 |
0 |
unsigned lemma_roots = data.next_1B(); |
4332
|
2 |
2 |
if (pass == 1) { |
|
0 |
0 |
if (pass == 1) { |
|
0 |
0 |
if (pass == 1) { |
4339
|
0 |
2 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
0 |
0 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
0 |
0 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
4344
|
14 |
4 |
for (unsigned i = 0; i < lemma_roots; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots; i++) { |
4346
|
14 |
0 |
int operations = data.next_1B(); |
|
0 |
0 |
int operations = data.next_1B(); |
|
0 |
0 |
int operations = data.next_1B(); |
4347
|
0 |
14 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
4348
|
12 |
2 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
12 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
4349
|
0 |
14 |
if (operations & ADD_START) { |
|
0 |
0 |
if (operations & ADD_START) { |
|
0 |
0 |
if (operations & ADD_START) { |
4350
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
4351
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
4353
|
14 |
0 |
if (operations & ADD_END) |
|
0 |
0 |
if (operations & ADD_END) |
|
0 |
0 |
if (operations & ADD_END) |
4354
|
14 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
28 |
14 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
4355
|
28 |
0 |
root[root_len++] = data.next_1B(); |
|
0 |
0 |
root[root_len++] = data.next_1B(); |
|
0 |
0 |
root[root_len++] = data.next_1B(); |
4356
|
14 |
0 |
uint16_t clas = data.next_2B(); |
|
0 |
0 |
uint16_t clas = data.next_2B(); |
|
0 |
0 |
uint16_t clas = data.next_2B(); |
4358
|
7 |
7 |
if (pass == 1) { // for each root |
|
0 |
0 |
if (pass == 1) { // for each root |
|
0 |
0 |
if (pass == 1) { // for each root |
4367
|
0 |
7 |
assert(uint8_t(lemma_len) == lemma_len); |
|
0 |
0 |
assert(uint8_t(lemma_len) == lemma_len); |
|
0 |
0 |
assert(uint8_t(lemma_len) == lemma_len); |
4372
|
0 |
7 |
assert(uint8_t(root_len) == root_len); |
|
0 |
0 |
assert(uint8_t(root_len) == root_len); |
|
0 |
0 |
assert(uint8_t(root_len) == root_len); |
4377
|
1 |
1 |
if (pass == 1) { // after the whole pass |
|
0 |
0 |
if (pass == 1) { // after the whole pass |
|
0 |
0 |
if (pass == 1) { // after the whole pass |
4378
|
1 |
0 |
lemmas.done_adding(); |
|
0 |
0 |
lemmas.done_adding(); |
|
0 |
0 |
lemmas.done_adding(); |
4379
|
1 |
0 |
roots.done_adding(); |
|
0 |
0 |
roots.done_adding(); |
|
0 |
0 |
roots.done_adding(); |
4387
|
1 |
0 |
tags.resize(data.next_2B()); |
|
1 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
4388
|
20 |
1 |
for (auto&& tag : tags) { |
|
0 |
0 |
for (auto&& tag : tags) { |
|
0 |
0 |
for (auto&& tag : tags) { |
4389
|
20 |
0 |
tag.resize(data.next_1B()); |
|
0 |
0 |
tag.resize(data.next_1B()); |
|
0 |
0 |
tag.resize(data.next_1B()); |
4390
|
60 |
20 |
for (unsigned i = 0; i < tag.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < tag.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < tag.size(); i++) |
4391
|
60 |
0 |
tag[i] = data.next_1B(); |
|
0 |
0 |
tag[i] = data.next_1B(); |
|
0 |
0 |
tag[i] = data.next_1B(); |
4395
|
1 |
0 |
suffixes.load(data); |
|
0 |
0 |
suffixes.load(data); |
|
0 |
0 |
suffixes.load(data); |
4398
|
1 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
0 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
0 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
4405
|
15 |
13 |
for (unsigned i = 0; i < classes_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) { |
4407
|
5 |
10 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
5 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
4408
|
15 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
0 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
0 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
4409
|
20 |
15 |
for (const uint16_t* ptr = tags_ptr + unaligned_load(indices_ptr + i), |
|
0 |
0 |
for (const uint16_t* ptr = tags_ptr + unaligned_load(indices_ptr + i), |
|
0 |
0 |
for (const uint16_t* ptr = tags_ptr + unaligned_load(indices_ptr + i), |
4412
|
20 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
0 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
0 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
4422
|
0 |
18 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
4424
|
48 |
0 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
0 |
0 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
0 |
0 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
4432
|
30 |
18 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
30 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
30 |
18 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
4433
|
30 |
0 |
if (unaligned_load(suff[suff_len])) { |
|
0 |
0 |
if (unaligned_load(suff[suff_len])) { |
|
0 |
0 |
if (unaligned_load(suff[suff_len])) { |
4437
|
30 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
0 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
0 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
4442
|
10 |
20 |
if (small_memeq(form.str, root, root_len)) { |
|
0 |
0 |
if (small_memeq(form.str, root, root_len)) { |
|
0 |
0 |
if (small_memeq(form.str, root, root_len)) { |
4444
|
10 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
10 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
10 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
4447
|
0 |
10 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
4451
|
18 |
10 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
0 |
0 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
0 |
0 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
4453
|
18 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
0 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
0 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
4463
|
0 |
0 |
int raw_lemma_len = addinfo.parse(lemma); |
|
0 |
0 |
int raw_lemma_len = addinfo.parse(lemma); |
4466
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
4472
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
4477
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
4483
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
4485
|
0 |
0 |
for (auto&& tag : suffix.second) |
|
0 |
0 |
for (auto&& tag : suffix.second) |
|
0 |
0 |
for (auto&& tag : suffix.second) |
4486
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
4487
|
0 |
0 |
if (!forms) { |
|
0 |
0 |
if (!forms) { |
|
0 |
0 |
if (!forms) { |
4488
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
4492
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
4493
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
4498
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
4545
|
0 |
0 |
for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) { |
4549
|
0 |
0 |
tag_filters.emplace_back(tag_filter.c_str()); |
4560
|
0 |
0 |
if (!form.len) return; |
4564
|
0 |
0 |
middle_masks.reserve(form.len); |
4566
|
0 |
0 |
for (unsigned initial = 0; initial < form.len; initial++) { |
4569
|
0 |
0 |
if (initial) { |
4571
|
0 |
0 |
if (!found) break; |
4576
|
0 |
0 |
if (initial_mask) { |
4577
|
0 |
0 |
middle_masks.resize(initial); |
4578
|
0 |
0 |
middle_masks.emplace_back(initial_mask); |
4579
|
0 |
0 |
for (unsigned middle = initial; middle < middle_masks.size(); middle++) { |
4580
|
0 |
0 |
if (!middle_masks[middle]) continue; |
4582
|
0 |
0 |
for (unsigned i = middle + 1; i < form.len; i++) { |
4584
|
0 |
0 |
if (!found) break; |
4585
|
0 |
0 |
if (unaligned_load(found)) { |
4586
|
0 |
0 |
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
|
0 |
0 |
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
4592
|
0 |
0 |
if (middle > initial && middle < form.len ) { |
|
0 |
0 |
if (middle > initial && middle < form.len ) { |
4593
|
0 |
0 |
if (initial) { |
4594
|
0 |
0 |
if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len); |
4598
|
0 |
0 |
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
|
0 |
0 |
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
4600
|
0 |
0 |
for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) { |
4601
|
0 |
0 |
for (unsigned filter = 0; filter < tag_filters.size(); filter++) |
4602
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
4603
|
0 |
0 |
if (i == lemmas_new_size) { |
4606
|
0 |
0 |
lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial); |
4615
|
0 |
0 |
if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end()); |
4740
|
22 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
5 |
17 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
37 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
5 |
32 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
4800
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
4843
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
4847
|
0 |
0 |
unsigned tag_length = data.next_1B(); |
4848
|
0 |
0 |
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
|
0 |
0 |
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
4849
|
0 |
0 |
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
|
0 |
0 |
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
4850
|
0 |
0 |
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
|
0 |
0 |
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
4853
|
0 |
0 |
dictionary.load(data); |
4857
|
0 |
0 |
if (data.next_1B()) { |
|
0 |
0 |
if (data.next_1B()) { |
4858
|
0 |
0 |
prefix_guesser.reset(new morpho_prefix_guesser(dictionary)); |
4859
|
0 |
0 |
prefix_guesser->load(data); |
4864
|
0 |
0 |
if (data.next_1B()) { |
|
0 |
0 |
if (data.next_1B()) { |
4865
|
0 |
0 |
statistical_guesser.reset(new morpho_statistical_guesser()); |
4866
|
0 |
0 |
statistical_guesser->load(data); |
4867
|
0 |
0 |
} |
4878
|
0 |
0 |
if (form.len) { |
4882
|
0 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
4885
|
0 |
0 |
dictionary.analyze(form, lemmas); |
4886
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
4887
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
4888
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
4891
|
0 |
0 |
analyze_special(form, lemmas); |
4892
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
4895
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
4896
|
0 |
0 |
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
|
0 |
0 |
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
4900
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
4901
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
4902
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, nullptr); |
4904
|
0 |
0 |
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
4905
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, &used_rules); |
4906
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
4907
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
4913
|
0 |
0 |
if (prefix_guesser_guesses) { |
4916
|
0 |
0 |
return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); |
4919
|
0 |
0 |
return a.lemma == b.lemma && a.tag == b.tag; |
|
0 |
0 |
return a.lemma == b.lemma && a.tag == b.tag; |
4921
|
0 |
0 |
if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end()); |
4924
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
4927
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
4936
|
0 |
0 |
if (lemma.len) { |
4937
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
4940
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
4961
|
0 |
0 |
return new czech_tokenizer(language, version, this); |
4992
|
0 |
0 |
if (!form.len) return; |
5000
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
5001
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
5002
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
5003
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
5004
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
5006
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
5008
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
5011
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
5012
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), number_tag); |
5013
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
5014
|
0 |
0 |
((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first]))) |
|
0 |
0 |
((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first]))) |
5015
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
5053
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) { |
5054
|
0 |
0 |
if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+')) |
|
0 |
0 |
if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+')) |
5056
|
0 |
0 |
if (len + 1 < lemma.len && lemma.str[len] == '^') { |
|
0 |
0 |
if (len + 1 < lemma.len && lemma.str[len] == '^') { |
5058
|
0 |
0 |
for (unsigned i = len + 1; ok && i < lemma.len; i++) |
|
0 |
0 |
for (unsigned i = len + 1; ok && i < lemma.len; i++) |
5059
|
0 |
0 |
ok &= (lemma.str[i] >= 'A' && lemma.str[i] <= 'Z') || |
5060
|
0 |
0 |
(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') || |
|
0 |
0 |
(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') || |
5061
|
0 |
0 |
(i > len + 1 && lemma.str[i] == '-'); |
5062
|
0 |
0 |
if (ok) return len; |
5085
|
0 |
0 |
for (size_t i = len; i < lemma.len; i++) |
5092
|
0 |
0 |
if (data.empty()) return true; |
5093
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
5094
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
5095
|
0 |
0 |
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
|
0 |
0 |
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
5115
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
5163
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
5237
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
5240
|
0 |
0 |
dictionary.load(data); |
5241
|
0 |
0 |
morpho_guesser.load(data); |
|
0 |
0 |
morpho_guesser.load(data); |
5252
|
0 |
0 |
if (form.len) { |
5256
|
0 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
5259
|
0 |
0 |
dictionary.analyze(form, lemmas); |
5260
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
5261
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
5262
|
0 |
0 |
if (!lemmas.empty()) |
5263
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
5266
|
0 |
0 |
analyze_special(form, lemmas); |
5267
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
5270
|
0 |
0 |
if (guesser == GUESSER) |
5271
|
0 |
0 |
morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas); |
|
0 |
0 |
morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas); |
5272
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
5275
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
5284
|
0 |
0 |
if (lemma.len) { |
5285
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
5305
|
0 |
0 |
return new english_tokenizer(version <= 2 ? 1 : 2); |
5312
|
0 |
0 |
if (!form.len) return; |
5315
|
0 |
0 |
if (form.len == 1) |
5319
|
0 |
0 |
case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return; |
5320
|
0 |
0 |
case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return; |
5321
|
0 |
0 |
case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return; |
5322
|
0 |
0 |
case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return; |
5323
|
0 |
0 |
case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
5324
|
0 |
0 |
case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
5325
|
0 |
0 |
case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag); |
5326
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nn_tag); return; |
5327
|
0 |
0 |
case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag); |
5328
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
5329
|
0 |
0 |
case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag); |
5330
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nn_tag); return; |
5331
|
0 |
0 |
case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag); |
5332
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), in_tag); return; |
5333
|
0 |
0 |
case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); |
5334
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), pos_tag); return; |
5341
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
5342
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
5343
|
0 |
0 |
while (codepoint == ',') { |
5345
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
5346
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
5347
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
5352
|
0 |
0 |
if (codepoint == '.' && number.len) { |
|
0 |
0 |
if (codepoint == '.' && number.len) { |
5354
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
5356
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
5357
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
5358
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len - 1), nns_tag); |
5361
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
5363
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
5365
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
5367
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
5368
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
5369
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nnp_tag); |
5370
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
5371
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), ls_tag); |
5378
|
0 |
0 |
while ((symbol || any_punctuation) && punctuation.len) { |
|
0 |
0 |
while ((symbol || any_punctuation) && punctuation.len) { |
5380
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
5381
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
5382
|
0 |
0 |
if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps; |
5383
|
0 |
0 |
if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe; |
5384
|
0 |
0 |
if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P; |
5385
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
5387
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
5388
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
5389
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
5390
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
5391
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
5392
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
5426
|
0 |
0 |
while (tags--) { |
5428
|
0 |
0 |
exceptions_tags.emplace_back(string(data.next(len), len)); |
5564
|
0 |
0 |
for (unsigned len = data.next_1B(); len; len--) { |
5570
|
0 |
0 |
if (exception) { |
5573
|
0 |
0 |
for (unsigned len = data.next_1B(); len; len--) { |
5576
|
0 |
0 |
for (unsigned tags = data.next_1B(); tags; tags--) |
5577
|
0 |
0 |
lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]); |
5584
|
0 |
0 |
for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) { |
5586
|
0 |
0 |
if (!found) break; |
5587
|
0 |
0 |
if (found[NEGATION_LEN]) { |
5588
|
0 |
0 |
if (form_lc.len - prefix >= found[TO_FOLLOW]) negation_len = found[NEGATION_LEN]; |
5594
|
0 |
0 |
add(JJ, lemma_lc, negation_len, lemmas); |
5595
|
0 |
0 |
add(RB, lemma_lc, negation_len, lemmas); |
5596
|
0 |
0 |
add(NN, lemma_lc, negation_len, lemmas); |
5597
|
0 |
0 |
add_NNS(lemma_lc, negation_len, lemmas); |
5614
|
0 |
0 |
if ( p == ( (form_lc.str + form_lc.len)) ) |
5621
|
0 |
0 |
if ( _klen > 0 ) { |
5626
|
0 |
0 |
if ( _upper < _lower ) |
5630
|
0 |
0 |
if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < *_mid ) |
5632
|
0 |
0 |
else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > *_mid ) |
5644
|
0 |
0 |
if ( _klen > 0 ) { |
5649
|
0 |
0 |
if ( _upper < _lower ) |
5653
|
0 |
0 |
if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < _mid[0] ) |
5655
|
0 |
0 |
else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > _mid[1] ) |
5669
|
0 |
0 |
if ( _tag_guesser_trans_actions[_trans] == 0 ) |
5674
|
0 |
0 |
while ( _nacts-- > 0 ) |
5679
|
0 |
0 |
{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); } |
|
0 |
0 |
{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); } |
5682
|
0 |
0 |
{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); } |
|
0 |
0 |
{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); } |
5685
|
0 |
0 |
{ add_VBG(lemma_lc, lemmas); } |
5688
|
0 |
0 |
{ add_VBD_VBN(lemma_lc, lemmas); } |
5691
|
0 |
0 |
{ add_VBZ(lemma_lc, lemmas); } |
5697
|
0 |
0 |
{ if (!added_SYM) added_SYM = true, add(SYM, lemma_lc, lemmas); } |
5700
|
0 |
0 |
{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); } |
5706
|
0 |
0 |
if ( ++p != ( (form_lc.str + form_lc.len)) ) |
5709
|
0 |
0 |
if ( p == ( (form_lc.str + form_lc.len)) ) |
5713
|
0 |
0 |
while ( __nacts-- > 0 ) { |
5714
|
0 |
0 |
switch ( *__acts++ ) { |
5716
|
0 |
0 |
{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); } |
5732
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
5734
|
0 |
0 |
if (!is_NNP && !is_NNPS) return false; |
5737
|
0 |
0 |
for (auto&& lemma : lemmas) { |
5741
|
0 |
0 |
if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false; |
|
0 |
0 |
if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false; |
5744
|
0 |
0 |
if (is_NNP && !was_NNP) add(NNP, lemma, lemmas); |
5745
|
0 |
0 |
if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas); |
|
0 |
0 |
if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas); |
5750
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
5759
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
5861
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
5870
|
0 |
0 |
if ( _klen > 0 ) { |
5875
|
0 |
0 |
if ( _upper < _lower ) |
5879
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
5881
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
5893
|
0 |
0 |
if ( _klen > 0 ) { |
5898
|
0 |
0 |
if ( _upper < _lower ) |
5902
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
5904
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
5918
|
0 |
0 |
if ( _NNS_trans_actions[_trans] == 0 ) |
5923
|
0 |
0 |
while ( _nacts-- > 0 ) |
5928
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = "an"; } |
5931
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 1, append = nullptr; } |
5934
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = "fe"; } |
5937
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
5940
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
5943
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
5946
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
5949
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
5952
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
5955
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
5958
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
5961
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 3, append = "y"; } |
5964
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
5967
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
5973
|
0 |
0 |
if ( cs == 0 ) |
5975
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
5981
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
6107
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
6116
|
0 |
0 |
if ( _klen > 0 ) { |
6121
|
0 |
0 |
if ( _upper < _lower ) |
6125
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
6127
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
6139
|
0 |
0 |
if ( _klen > 0 ) { |
6144
|
0 |
0 |
if ( _upper < _lower ) |
6148
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
6150
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
6164
|
0 |
0 |
if ( _NNPS_trans_actions[_trans] == 0 ) |
6169
|
0 |
0 |
while ( _nacts-- > 0 ) |
6174
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = "AN"; } |
6177
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = "an"; } |
6180
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
6183
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = "FE"; } |
6186
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 3, append = "fe"; } |
6189
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
6192
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
6195
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
6198
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
6201
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 2, append = nullptr; } |
6204
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 1, append = nullptr; } |
6207
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
6210
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
6213
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 3, append = "Y"; } |
6216
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 3, append = "y"; } |
6219
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 2, append = nullptr; } |
6222
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 1, append = nullptr; } |
6228
|
0 |
0 |
if ( cs == 0 ) |
6230
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
6236
|
0 |
0 |
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
6536
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
6545
|
0 |
0 |
if ( _klen > 0 ) { |
6550
|
0 |
0 |
if ( _upper < _lower ) |
6554
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
6556
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
6568
|
0 |
0 |
if ( _klen > 0 ) { |
6573
|
0 |
0 |
if ( _upper < _lower ) |
6577
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
6579
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
6593
|
0 |
0 |
if ( _VBG_trans_actions[_trans] == 0 ) |
6598
|
0 |
0 |
while ( _nacts-- > 0 ) |
6603
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
6606
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 3, append = "e"; } |
6609
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
6612
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = "e"; } |
6615
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 3, append = nullptr; } |
6618
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
6621
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 3, append = nullptr; } |
6624
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 3, append = "e"; } |
6627
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 3, append = nullptr; } |
6630
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 3, append = "e"; } |
6633
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 3, append = nullptr; } |
6636
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 3, append = "e"; } |
6639
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 3, append = nullptr; } |
6642
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 3, append = "e"; } |
6645
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 3, append = nullptr; } |
6648
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
6651
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 3, append = nullptr; } |
6654
|
0 |
0 |
{ if (best > 'r') best = 'r', remove = 3, append = "e"; } |
6660
|
0 |
0 |
if ( cs == 0 ) |
6662
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
6665
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
6669
|
0 |
0 |
while ( __nacts-- > 0 ) { |
6672
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
6675
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
6678
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
6687
|
0 |
0 |
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
6990
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
6999
|
0 |
0 |
if ( _klen > 0 ) { |
7004
|
0 |
0 |
if ( _upper < _lower ) |
7008
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
7010
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
7022
|
0 |
0 |
if ( _klen > 0 ) { |
7027
|
0 |
0 |
if ( _upper < _lower ) |
7031
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
7033
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
7047
|
0 |
0 |
if ( _VBD_VBN_trans_actions[_trans] == 0 ) |
7052
|
0 |
0 |
while ( _nacts-- > 0 ) |
7057
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
7060
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
7063
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
7066
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7069
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7072
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7075
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
7078
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 3, append = "y"; } |
7081
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
7084
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
7087
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
7090
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
7093
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
7096
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 2, append = nullptr; } |
7099
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 1, append = nullptr; } |
7102
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 2, append = nullptr; } |
7105
|
0 |
0 |
{ if (best > 'r') best = 'r', remove = 1, append = nullptr; } |
7111
|
0 |
0 |
if ( cs == 0 ) |
7113
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7116
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7120
|
0 |
0 |
while ( __nacts-- > 0 ) { |
7123
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7126
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
7129
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
7138
|
0 |
0 |
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
7217
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7226
|
0 |
0 |
if ( _klen > 0 ) { |
7231
|
0 |
0 |
if ( _upper < _lower ) |
7235
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
7237
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
7249
|
0 |
0 |
if ( _klen > 0 ) { |
7254
|
0 |
0 |
if ( _upper < _lower ) |
7258
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
7260
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
7274
|
0 |
0 |
if ( _VBZ_trans_actions[_trans] == 0 ) |
7279
|
0 |
0 |
while ( _nacts-- > 0 ) |
7284
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
7287
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
7290
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
7293
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7296
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7299
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7302
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 3, append = "y"; } |
7305
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
7308
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
7314
|
0 |
0 |
if ( cs == 0 ) |
7316
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7322
|
0 |
0 |
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
7448
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7457
|
0 |
0 |
if ( _klen > 0 ) { |
7462
|
0 |
0 |
if ( _upper < _lower ) |
7466
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
7468
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
7480
|
0 |
0 |
if ( _klen > 0 ) { |
7485
|
0 |
0 |
if ( _upper < _lower ) |
7489
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
7491
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
7505
|
0 |
0 |
if ( _JJR_RBR_trans_actions[_trans] == 0 ) |
7510
|
0 |
0 |
while ( _nacts-- > 0 ) |
7515
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = nullptr; } |
7518
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 3, append = nullptr; } |
7521
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = "y"; } |
7524
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7527
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7530
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7536
|
0 |
0 |
if ( cs == 0 ) |
7538
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7544
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
7674
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7683
|
0 |
0 |
if ( _klen > 0 ) { |
7688
|
0 |
0 |
if ( _upper < _lower ) |
7692
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
7694
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
7706
|
0 |
0 |
if ( _klen > 0 ) { |
7711
|
0 |
0 |
if ( _upper < _lower ) |
7715
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
7717
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
7731
|
0 |
0 |
if ( _JJS_RBS_trans_actions[_trans] == 0 ) |
7736
|
0 |
0 |
while ( _nacts-- > 0 ) |
7741
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
7744
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 4, append = nullptr; } |
7747
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 4, append = "y"; } |
7750
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = nullptr; } |
7753
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 2, append = nullptr; } |
7756
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = nullptr; } |
7762
|
0 |
0 |
if ( cs == 0 ) |
7764
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7770
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
7853
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
7857
|
0 |
0 |
unsigned length = data.next_1B(); |
7858
|
0 |
0 |
unknown_tag.assign(data.next(length), length); |
|
0 |
0 |
unknown_tag.assign(data.next(length), length); |
7869
|
0 |
0 |
if (form.len) { |
7872
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
7873
|
0 |
0 |
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
7876
|
0 |
0 |
while (lemmatags.len) { |
7878
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
7879
|
0 |
0 |
if (!lemmatags.len) break; |
7884
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
7886
|
0 |
0 |
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
7888
|
0 |
0 |
lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len)); |
7891
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
7894
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
7903
|
0 |
0 |
if (lemma.len) { |
7906
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
7908
|
0 |
0 |
if (formtags.len) formtags.len--, formtags.str++; |
7912
|
0 |
0 |
while (formtags.len) { |
7914
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
7915
|
0 |
0 |
if (!formtags.len) break; |
7920
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
7922
|
0 |
0 |
if (formtags.len) formtags.len--, formtags.str++; |
7926
|
0 |
0 |
if (filter.matches(tag.c_str())) { |
7927
|
0 |
0 |
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
|
0 |
0 |
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
7928
|
0 |
0 |
forms.back().forms.emplace_back(string(form_start, form_len), tag); |
7932
|
0 |
0 |
if (any_result) return NO_GUESSER; |
7940
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
7946
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
7952
|
0 |
0 |
while (form_len < form.len && form.str[form_len] != ' ') form_len++; |
|
0 |
0 |
while (form_len < form.len && form.str[form_len] != ' ') form_len++; |
8073
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
8077
|
1 |
0 |
unsigned length = data.next_1B(); |
8078
|
1 |
0 |
unknown_tag.assign(data.next(length), length); |
8079
|
1 |
0 |
length = data.next_1B(); |
8080
|
1 |
0 |
number_tag.assign(data.next(length), length); |
8081
|
1 |
0 |
length = data.next_1B(); |
8082
|
1 |
0 |
punctuation_tag.assign(data.next(length), length); |
8083
|
1 |
0 |
length = data.next_1B(); |
8084
|
1 |
0 |
symbol_tag.assign(data.next(length), length); |
8087
|
1 |
0 |
dictionary.load(data); |
8091
|
1 |
0 |
if (data.next_1B()) { |
|
0 |
1 |
if (data.next_1B()) { |
8092
|
0 |
0 |
statistical_guesser.reset(new morpho_statistical_guesser()); |
8093
|
0 |
0 |
statistical_guesser->load(data); |
8094
|
0 |
0 |
} |
8105
|
14 |
0 |
if (form.len) { |
8109
|
14 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
8112
|
14 |
0 |
dictionary.analyze(form, lemmas); |
8113
|
0 |
14 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
8114
|
4 |
10 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
4 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
8115
|
4 |
10 |
if (!lemmas.empty()) return NO_GUESSER; |
8118
|
4 |
0 |
analyze_special(form, lemmas); |
8119
|
0 |
4 |
if (!lemmas.empty()) return NO_GUESSER; |
8122
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
8123
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
8124
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, nullptr); |
8126
|
0 |
0 |
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
8127
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, &used_rules); |
8128
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
8129
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
8132
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
8135
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
8144
|
0 |
0 |
if (lemma.len) { |
8145
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
8175
|
4 |
0 |
if (!form.len) return; |
8183
|
0 |
4 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
8184
|
0 |
4 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8185
|
4 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
4 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
0 |
4 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
8186
|
0 |
4 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8187
|
0 |
4 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
8189
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
8191
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8194
|
0 |
4 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
8195
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
8202
|
4 |
4 |
while (form.len) { |
8204
|
4 |
0 |
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
|
0 |
4 |
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
8205
|
0 |
4 |
symbol = symbol && unicode::category(codepoint) & unicode::S; |
|
0 |
4 |
symbol = symbol && unicode::category(codepoint) & unicode::S; |
8207
|
4 |
0 |
if (punctuation) |
8208
|
4 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
8209
|
0 |
0 |
else if (symbol) |
8210
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag); |
8274
|
0 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
|
1 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
|
0 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
8300
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8309
|
0 |
0 |
3); |
|
0 |
0 |
3); |
8310
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8316
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8322
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
1 |
0 |
if (res->load(is)) return res.release(); |
8328
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8334
|
0 |
0 |
if (!derinet->load(is)) return nullptr; |
|
0 |
0 |
if (!derinet->load(is)) return nullptr; |
8336
|
0 |
0 |
unique_ptr dictionary(load(is)); |
8337
|
0 |
0 |
if (!dictionary) return nullptr; |
8348
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
8349
|
0 |
0 |
if (!f) return nullptr; |
8351
|
0 |
0 |
return load(f); |
8378
|
0 |
0 |
for (auto&& tag : tags) { |
8380
|
0 |
0 |
for (unsigned i = 0; i < tag.size(); i++) |
8391
|
0 |
0 |
if (!used) return false; |
8393
|
0 |
0 |
for (auto&& used_rule : *used) |
8394
|
0 |
0 |
if (used_rule == rule) |
8406
|
0 |
0 |
string rule_label; rule_label.reserve(12); |
8408
|
0 |
0 |
for (; suffix_len < form.len; suffix_len++) { |
8409
|
0 |
0 |
rule_label.push_back(form.str[form.len - (suffix_len + 1)]); |
8410
|
0 |
0 |
if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); })) |
8414
|
0 |
0 |
for (suffix_len++; suffix_len--; ) { |
8416
|
0 |
0 |
rule_label.push_back(' '); |
8420
|
0 |
0 |
for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) { |
8421
|
0 |
0 |
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
|
0 |
0 |
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
8423
|
0 |
0 |
if (!found) break; |
8424
|
0 |
0 |
if (*(found += sizeof(uint16_t))) { |
8430
|
0 |
0 |
if (rule) { |
8432
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
8433
|
0 |
0 |
if (used) used->push_back(rule_label); |
|
0 |
0 |
if (used) used->push_back(rule_label); |
8434
|
0 |
0 |
for (int rules_len = *rule++; rules_len; rules_len--) { |
8441
|
0 |
0 |
if (pref_del_len + suff_del_len > form.len || |
|
0 |
0 |
if (pref_del_len + suff_del_len > form.len || |
8442
|
0 |
0 |
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
|
0 |
0 |
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
8443
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
8448
|
0 |
0 |
lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len); |
8449
|
0 |
0 |
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
|
0 |
0 |
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
8450
|
0 |
0 |
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
|
0 |
0 |
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
8451
|
0 |
0 |
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
|
0 |
0 |
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
8452
|
0 |
0 |
while (tags_len--) |
8453
|
0 |
0 |
lemmas.emplace_back(lemma, this->tags[unaligned_load_inc(tags)]); |
8461
|
0 |
0 |
if (lemmas.size() == lemmas_initial_size) |
8462
|
0 |
0 |
if (!contains(used, string())) { |
8463
|
0 |
0 |
if (used) used->push_back(string()); |
8464
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), tags[default_tag]); |
8486
|
0 |
0 |
if (!filter) return; |
8488
|
0 |
0 |
wildcard.assign(filter); |
8491
|
0 |
0 |
for (int tag_pos = 0, filter_pos = 0; filter[filter_pos]; tag_pos++, filter_pos++) { |
8492
|
0 |
0 |
if (filter[filter_pos] == '?') continue; |
8493
|
0 |
0 |
if (filter[filter_pos] == '[') { |
8497
|
0 |
0 |
if (filter[filter_pos] == '^') negate = true, filter_pos++; |
8500
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
8503
|
0 |
0 |
filters.emplace_back(tag_pos, negate, chars_start, filter_pos - chars_start); |
8504
|
0 |
0 |
if (!filter[filter_pos]) break; |
8506
|
0 |
0 |
filters.emplace_back(tag_pos, false, filter_pos, 1); |
8599
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
8 |
3 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
11 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
11 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
11 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
7 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
6 |
1 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
8607
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
8610
|
1 |
0 |
maps.resize(data.next_1B()); |
|
1 |
0 |
maps.resize(data.next_1B()); |
8611
|
26 |
1 |
for (auto&& map : maps) |
8612
|
26 |
0 |
map.load(data); |
|
0 |
0 |
map.load(data); |
8654
|
267 |
0 |
if (value < 0x80) *where++ = value; |
8655
|
0 |
0 |
else if (value < 0x4000) *where++ = (value >> 7) | 0x80u, *where++ = value & 0x7Fu; |
8656
|
0 |
0 |
else if (value < 0x200000) *where++ = (value >> 14) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
8657
|
0 |
0 |
else if (value < 0x10000000) *where++ = (value >> 21) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
8705
|
0 |
0 |
class feature_sequences { |
|
1 |
0 |
class feature_sequences { |
|
0 |
0 |
class feature_sequences { |
8734
|
0 |
0 |
return it ? unaligned_load(it) : 0; |
|
55 |
53 |
return it ? unaligned_load(it) : 0; |
|
0 |
0 |
return it ? unaligned_load(it) : 0; |
8743
|
0 |
0 |
if (!elementary.load(is)) return false; |
|
1 |
0 |
if (!elementary.load(is)) return false; |
|
0 |
0 |
if (!elementary.load(is)) return false; |
8746
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
8749
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
1 |
0 |
sequences.resize(data.next_1B()); |
|
1 |
0 |
sequences.resize(data.next_1B()); |
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
0 |
0 |
sequences.resize(data.next_1B()); |
8750
|
0 |
0 |
for (auto&& sequence : sequences) { |
|
21 |
1 |
for (auto&& sequence : sequences) { |
|
0 |
0 |
for (auto&& sequence : sequences) { |
8751
|
0 |
0 |
sequence.dependant_range = data.next_4B(); |
|
21 |
0 |
sequence.dependant_range = data.next_4B(); |
|
0 |
0 |
sequence.dependant_range = data.next_4B(); |
8752
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
21 |
0 |
sequence.elements.resize(data.next_1B()); |
|
21 |
0 |
sequence.elements.resize(data.next_1B()); |
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
8753
|
0 |
0 |
for (auto&& element : sequence.elements) { |
|
45 |
21 |
for (auto&& element : sequence.elements) { |
|
0 |
0 |
for (auto&& element : sequence.elements) { |
8754
|
0 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
|
45 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
|
0 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
8755
|
0 |
0 |
element.elementary_index = data.next_4B(); |
|
45 |
0 |
element.elementary_index = data.next_4B(); |
|
0 |
0 |
element.elementary_index = data.next_4B(); |
8756
|
0 |
0 |
element.sequence_index = data.next_4B(); |
|
45 |
0 |
element.sequence_index = data.next_4B(); |
|
0 |
0 |
element.sequence_index = data.next_4B(); |
8760
|
0 |
0 |
scores.resize(data.next_1B()); |
|
0 |
0 |
scores.resize(data.next_1B()); |
|
1 |
0 |
scores.resize(data.next_1B()); |
|
1 |
0 |
scores.resize(data.next_1B()); |
|
0 |
0 |
scores.resize(data.next_1B()); |
|
0 |
0 |
scores.resize(data.next_1B()); |
8761
|
0 |
0 |
for (auto&& score : scores) |
|
21 |
1 |
for (auto&& score : scores) |
|
0 |
0 |
for (auto&& score : scores) |
8762
|
0 |
0 |
score.load(data); |
|
21 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
8782
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
8790
|
0 |
0 |
caches.reserve(self.sequences.size()); |
|
1 |
0 |
caches.reserve(self.sequences.size()); |
|
0 |
0 |
caches.reserve(self.sequences.size()); |
8792
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
|
21 |
1 |
for (auto&& sequence : self.sequences) { |
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
8793
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
21 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
8794
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
1 |
20 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
8795
|
0 |
0 |
for (auto&& element : sequence.elements) |
|
45 |
21 |
for (auto&& element : sequence.elements) |
|
0 |
0 |
for (auto&& element : sequence.elements) |
8796
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
26 |
19 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
2 |
24 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
8799
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
1 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
8800
|
0 |
0 |
window.resize(max_window_size); |
|
1 |
0 |
window.resize(max_window_size); |
|
0 |
0 |
window.resize(max_window_size); |
8811
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
1 |
1 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
8812
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
1 |
1 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
8813
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
7 |
2 |
for (unsigned i = 0; i < forms.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
8814
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
5 |
2 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
8822
|
0 |
0 |
for (auto&& cache : c.caches) |
|
42 |
2 |
for (auto&& cache : c.caches) |
|
0 |
0 |
for (auto&& cache : c.caches) |
8828
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
20 |
2 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
8834
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
54 |
16 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
2 |
52 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
52 |
18 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
8839
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
190 |
7 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
8840
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
179 |
11 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
8844
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
332 |
114 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
8850
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
91 |
4 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
86 |
5 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
8853
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
235 |
2 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
8860
|
0 |
0 |
if (value == elementary_feature_unknown) { |
|
65 |
267 |
if (value == elementary_feature_unknown) { |
|
0 |
0 |
if (value == elementary_feature_unknown) { |
8869
|
0 |
0 |
if (!key_size) { |
|
65 |
114 |
if (!key_size) { |
|
0 |
0 |
if (!key_size) { |
8872
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
84 |
30 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
78 |
6 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
108 |
6 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
8933
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
1 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
8946
|
0 |
0 |
if (!forms.size()) return; |
|
2 |
0 |
if (!forms.size()) return; |
|
0 |
0 |
if (!forms.size()) return; |
8950
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
7 |
2 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
8951
|
0 |
0 |
if (analyses[i].empty()) return; |
|
7 |
0 |
if (analyses[i].empty()) return; |
|
0 |
0 |
if (analyses[i].empty()) return; |
8952
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
3 |
4 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
8955
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
2 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
8961
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
2 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
8967
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
7 |
2 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
8970
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
7 |
21 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
8971
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
11 |
7 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
8972
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
22 |
11 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
8976
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
45 |
14 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
37 |
8 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
8977
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
15 |
22 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
11 |
4 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
8982
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
20 |
2 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
8983
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
6 |
16 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
2 |
4 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
20 |
2 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
8987
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
|
4 |
18 |
if (same_tags >= decoding_order-1) { |
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
8988
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
4 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
9003
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
3 |
2 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
9004
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
1 |
2 |
if (c.nodes[node].score > c.nodes[best].score) |
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
9007
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
7 |
2 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
9056
|
0 |
0 |
maps.resize(MAP_TOTAL); |
9105
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
9109
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
9120
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
9125
|
0 |
0 |
if (index == string::npos) index = tag.size(); |
9126
|
0 |
0 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
0 |
0 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
9128
|
0 |
0 |
if (index < tag.size()) index++; |
9129
|
0 |
0 |
if (index < tag.size()) index = tag.find(separator, index); |
9130
|
0 |
0 |
if (index < tag.size()) index++; |
9131
|
0 |
0 |
for (size_t length; index < tag.size(); index += length + 1) { |
9133
|
0 |
0 |
length = (length == string::npos ? tag.size() : length) - index; |
9135
|
0 |
0 |
for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++) |
9136
|
0 |
0 |
if (tag[index + equal_sign] == '=') { |
9140
|
0 |
0 |
if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE; |
9143
|
0 |
0 |
if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER; |
9144
|
0 |
0 |
if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER; |
9145
|
0 |
0 |
if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON; |
9148
|
0 |
0 |
if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE; |
9152
|
0 |
0 |
if (value >= 0) |
9158
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
9160
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
9170
|
0 |
0 |
if (verb_candidate >= 0) { |
9176
|
0 |
0 |
if (analyses[i].size() == 1) { |
9184
|
0 |
0 |
} else if (forms[i].len <= 0) { |
9199
|
0 |
0 |
while (form.len) { |
9203
|
0 |
0 |
num = num || cat & unicode::N; |
|
0 |
0 |
num = num || cat & unicode::N; |
9204
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
9205
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
9207
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
9237
|
0 |
0 |
if (prev_dynamic) { |
9245
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
9299
|
0 |
0 |
maps.resize(MAP_TOTAL); |
9335
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
9339
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
9342
|
0 |
0 |
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
9343
|
0 |
0 |
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
9344
|
0 |
0 |
per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty; |
9345
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
9348
|
0 |
0 |
if (analyses[i][j].tag[0] == 'V') { |
9350
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
9360
|
0 |
0 |
if (verb_candidate >= 0) { |
9366
|
0 |
0 |
if (analyses[i].size() == 1) { |
9370
|
0 |
0 |
} else if (forms[i].len <= 0) { |
9381
|
0 |
0 |
while (form.len) { |
9385
|
0 |
0 |
num = num || cat & unicode::N; |
|
0 |
0 |
num = num || cat & unicode::N; |
9386
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
9387
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
9389
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
9409
|
0 |
0 |
if (prev_dynamic) { |
9417
|
0 |
0 |
if (tag.tag[0] == 'V') { |
9471
|
1 |
0 |
maps.resize(MAP_TOTAL); |
9519
|
7 |
2 |
for (unsigned i = forms.size(); i--;) { |
9523
|
11 |
7 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
9525
|
11 |
0 |
per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty; |
9526
|
11 |
0 |
per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty; |
9527
|
11 |
0 |
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
9528
|
2 |
9 |
per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty; |
9529
|
0 |
11 |
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
9530
|
4 |
7 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
4 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
9533
|
3 |
8 |
if (analyses[i][j].tag[0] == 'V') { |
9535
|
1 |
2 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
1 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
9545
|
2 |
5 |
if (verb_candidate >= 0) { |
9551
|
5 |
2 |
if (analyses[i].size() == 1) { |
9559
|
0 |
2 |
} else if (forms[i].len <= 0) { |
9574
|
9 |
2 |
while (form.len) { |
9578
|
9 |
0 |
num = num || cat & unicode::N; |
|
9 |
0 |
num = num || cat & unicode::N; |
9579
|
9 |
0 |
cap = cap || cat & unicode::Lut; |
|
9 |
0 |
cap = cap || cat & unicode::Lut; |
9580
|
9 |
0 |
dash = dash || cat & unicode::Pd; |
|
9 |
0 |
dash = dash || cat & unicode::Pd; |
9582
|
9 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
7 |
2 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
2 |
if (index == 10 || (!form.len && index < 10)) { |
9612
|
20 |
2 |
if (prev_dynamic) { |
9620
|
3 |
19 |
if (tag.tag[0] == 'V') { |
9667
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
2 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
2 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
2 |
while (lock.test_and_set(memory_order_acquire)) {} |
9676
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
2 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
2 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
2 |
while (lock.test_and_set(memory_order_acquire)) {} |
9677
|
0 |
0 |
if (!stack.empty()) { |
|
1 |
1 |
if (!stack.empty()) { |
|
0 |
0 |
if (!stack.empty()) { |
|
1 |
1 |
if (!stack.empty()) { |
|
1 |
1 |
if (!stack.empty()) { |
9728
|
0 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
1 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
0 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
9742
|
0 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
|
1 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
|
0 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
9744
|
0 |
0 |
if (!features.load(is)) return false; |
|
1 |
0 |
if (!features.load(is)) return false; |
|
0 |
0 |
if (!features.load(is)) return false; |
9756
|
0 |
0 |
if (!dict) return; |
|
2 |
0 |
if (!dict) return; |
|
0 |
0 |
if (!dict) return; |
9759
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
1 |
1 |
if (!c) c = new cache(*this); |
|
1 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
9762
|
0 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
2 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
0 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
9763
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
7 |
2 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
9766
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
7 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
7 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
9769
|
0 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
1 |
1 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
0 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
9772
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
7 |
2 |
for (unsigned i = 0; i < forms.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
9783
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
9878
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
1 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
9884
|
0 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
9885
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
9893
|
1 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
9894
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
1 |
0 |
if (res->load(is)) return res.release(); |
9901
|
0 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
9902
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
9911
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
9912
|
0 |
0 |
if (!f) return nullptr; |
9914
|
0 |
0 |
return load(f); |
9919
|
0 |
0 |
return morpho ? morpho->new_tokenizer() : nullptr; |
10024
|
0 |
0 |
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
|
0 |
0 |
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
10025
|
0 |
0 |
if (pdt_tag[i] != '-') { |
10026
|
0 |
0 |
if (!tag.empty()) tag.push_back('|'); |
10033
|
0 |
0 |
for (unsigned i = 0; i + 2 < lemma.size(); i++) |
10034
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
10035
|
0 |
0 |
if (!tag.empty()) tag.push_back('|'); |
10044
|
0 |
0 |
return raw_lemma < lemma.size() ? (lemma.resize(raw_lemma), true) : false; |
10055
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) { |
10061
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
10069
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) { |
10070
|
0 |
0 |
for (auto&& tagged_form : tagged_lemma_forms.forms) |
10076
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
10130
|
0 |
0 |
return lemma_id_len < lemma.size() ? (lemma.resize(lemma_id_len), true) : false; |
10140
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) |
10144
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
10152
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) |
10156
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
10210
|
0 |
0 |
return raw_lemma_len < lemma.size() ? (lemma.resize(raw_lemma_len), true) : false; |
10220
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) |
10224
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
10232
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) |
10236
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
10275
|
0 |
0 |
if (name == "pdt_to_conll2009") return tagset_converter::new_pdt_to_conll2009_converter(); |
10276
|
0 |
0 |
if (name == "strip_lemma_comment") return tagset_converter::new_strip_lemma_comment_converter(dictionary); |
10277
|
0 |
0 |
if (name == "strip_lemma_id") return tagset_converter::new_strip_lemma_id_converter(dictionary); |
10284
|
0 |
0 |
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
|
0 |
0 |
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
10285
|
0 |
0 |
inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); } |
10294
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
10296
|
0 |
0 |
for (unsigned j = forms.size() - 1; j > i; j--) |
10297
|
0 |
0 |
if (forms[j].lemma == forms[i].lemma) { |
10299
|
0 |
0 |
for (auto&& tagged_form : forms[j].forms) |
10303
|
0 |
0 |
if (j < forms.size() - 1) { |
10311
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
10314
|
0 |
0 |
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
|
0 |
0 |
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
10315
|
0 |
0 |
inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 || (tag_compare == 0 && a.form < b.form); } |
10473
|
321 |
3 |
const unordered_set czech_tokenizer::abbreviations_czech = { |
|
0 |
0 |
const unordered_set czech_tokenizer::abbreviations_czech = { |
10489
|
309 |
3 |
const unordered_set czech_tokenizer::abbreviations_slovak = { |
|
0 |
0 |
const unordered_set czech_tokenizer::abbreviations_slovak = { |
10506
|
0 |
0 |
: ragel_tokenizer(version <= 1 ? 1 : 2), m(m) { |
|
0 |
0 |
: ragel_tokenizer(version <= 1 ? 1 : 2), m(m) { |
10520
|
0 |
0 |
if (!m) return; |
10521
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
10524
|
0 |
0 |
for (unsigned hyphens = 1; hyphens <= 2; hyphens++) { |
10526
|
0 |
0 |
if (tokens.size() < 2*hyphens + 1) break; |
10528
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
10529
|
0 |
0 |
tokens[first_hyphen].start + tokens[first_hyphen].length != tokens[first_hyphen + 1].start || |
10530
|
0 |
0 |
tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start || |
|
0 |
0 |
tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start || |
10534
|
0 |
0 |
if (m->analyze(string_piece(chars[tokens[first_hyphen-1].start].str, chars[tokens.back().start + tokens.back().length].str - chars[tokens[first_hyphen-1].start].str), morpho::NO_GUESSER, lemmas) >= 0) |
10538
|
0 |
0 |
if (matched_hyphens) { |
10552
|
0 |
0 |
while (tokenize_url_email(tokens)) |
10553
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
10569
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
10574
|
0 |
0 |
switch ( _czech_tokenizer_from_state_actions[cs] ) { |
10583
|
0 |
0 |
if ( _klen > 0 ) { |
10588
|
0 |
0 |
if ( _upper < _lower ) |
10592
|
0 |
0 |
if ( _widec < _mid[0] ) |
10594
|
0 |
0 |
else if ( _widec > _mid[1] ) |
10600
|
0 |
0 |
if ( |
10601
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
10606
|
0 |
0 |
if ( |
10607
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
10620
|
0 |
0 |
if ( _klen > 0 ) { |
10625
|
0 |
0 |
if ( _upper < _lower ) |
10629
|
0 |
0 |
if ( _widec < *_mid ) |
10631
|
0 |
0 |
else if ( _widec > *_mid ) |
10643
|
0 |
0 |
if ( _klen > 0 ) { |
10648
|
0 |
0 |
if ( _upper < _lower ) |
10652
|
0 |
0 |
if ( _widec < _mid[0] ) |
10654
|
0 |
0 |
else if ( _widec > _mid[1] ) |
10669
|
0 |
0 |
if ( _czech_tokenizer_trans_actions[_trans] == 0 ) |
10683
|
0 |
0 |
do |
10684
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
10692
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
10695
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
10700
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
10702
|
0 |
0 |
do |
10703
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
10712
|
0 |
0 |
do |
10713
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
10721
|
0 |
0 |
do |
10722
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
10729
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
10731
|
0 |
0 |
do |
10732
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
10741
|
0 |
0 |
do |
10742
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
10750
|
0 |
0 |
switch ( _czech_tokenizer_to_state_actions[cs] ) { |
10756
|
0 |
0 |
if ( cs == 0 ) |
10758
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
10761
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
10763
|
0 |
0 |
if ( _czech_tokenizer_eof_trans[cs] > 0 ) { |
10795
|
342 |
3 |
const unordered_set english_tokenizer::abbreviations = { |
|
0 |
0 |
const unordered_set english_tokenizer::abbreviations = { |
10894
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
10909
|
0 |
0 |
if ( ( index) == ( end) ) |
10918
|
0 |
0 |
if ( _klen > 0 ) { |
10923
|
0 |
0 |
if ( _upper < _lower ) |
10927
|
0 |
0 |
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid ) |
10929
|
0 |
0 |
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid ) |
10941
|
0 |
0 |
if ( _klen > 0 ) { |
10946
|
0 |
0 |
if ( _upper < _lower ) |
10950
|
0 |
0 |
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] ) |
10952
|
0 |
0 |
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] ) |
10966
|
0 |
0 |
if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 ) |
10980
|
0 |
0 |
if ( cs == 0 ) |
10982
|
0 |
0 |
if ( ++( index) != ( end) ) |
10985
|
0 |
0 |
if ( ( index) == ( end) ) |
10987
|
0 |
0 |
switch ( _english_tokenizer_split_token_eof_actions[cs] ) { |
10997
|
0 |
0 |
if (split_len && split_len < end) { |
11151
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
11160
|
0 |
0 |
while (tokenize_url_email(tokens)) |
11161
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
11177
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
11182
|
0 |
0 |
switch ( _english_tokenizer_from_state_actions[cs] ) { |
11191
|
0 |
0 |
if ( _klen > 0 ) { |
11196
|
0 |
0 |
if ( _upper < _lower ) |
11200
|
0 |
0 |
if ( _widec < _mid[0] ) |
11202
|
0 |
0 |
else if ( _widec > _mid[1] ) |
11208
|
0 |
0 |
if ( |
11209
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
11214
|
0 |
0 |
if ( |
11215
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
11228
|
0 |
0 |
if ( _klen > 0 ) { |
11233
|
0 |
0 |
if ( _upper < _lower ) |
11237
|
0 |
0 |
if ( _widec < *_mid ) |
11239
|
0 |
0 |
else if ( _widec > *_mid ) |
11251
|
0 |
0 |
if ( _klen > 0 ) { |
11256
|
0 |
0 |
if ( _upper < _lower ) |
11260
|
0 |
0 |
if ( _widec < _mid[0] ) |
11262
|
0 |
0 |
else if ( _widec > _mid[1] ) |
11277
|
0 |
0 |
if ( _english_tokenizer_trans_actions[_trans] == 0 ) |
11291
|
0 |
0 |
do |
11292
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11300
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
11303
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
11308
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
11310
|
0 |
0 |
do |
11311
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11320
|
0 |
0 |
do |
11321
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11329
|
0 |
0 |
do |
11330
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11337
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
11339
|
0 |
0 |
do |
11340
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11349
|
0 |
0 |
do |
11350
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11358
|
0 |
0 |
switch ( _english_tokenizer_to_state_actions[cs] ) { |
11364
|
0 |
0 |
if ( cs == 0 ) |
11366
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
11369
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
11371
|
0 |
0 |
if ( _english_tokenizer_eof_trans[cs] > 0 ) { |
11528
|
3 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
11537
|
0 |
2 |
while (tokenize_url_email(tokens)) |
11538
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
11554
|
2 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
11559
|
10 |
27 |
switch ( _generic_tokenizer_from_state_actions[cs] ) { |
11568
|
10 |
27 |
if ( _klen > 0 ) { |
11573
|
17 |
10 |
if ( _upper < _lower ) |
11577
|
3 |
14 |
if ( _widec < _mid[0] ) |
11579
|
14 |
0 |
else if ( _widec > _mid[1] ) |
11585
|
0 |
0 |
if ( |
11586
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
11591
|
0 |
0 |
if ( |
11592
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
11605
|
37 |
0 |
if ( _klen > 0 ) { |
11610
|
112 |
27 |
if ( _upper < _lower ) |
11614
|
66 |
46 |
if ( _widec < *_mid ) |
11616
|
36 |
10 |
else if ( _widec > *_mid ) |
11628
|
24 |
3 |
if ( _klen > 0 ) { |
11633
|
32 |
5 |
if ( _upper < _lower ) |
11637
|
10 |
22 |
if ( _widec < _mid[0] ) |
11639
|
3 |
19 |
else if ( _widec > _mid[1] ) |
11654
|
12 |
27 |
if ( _generic_tokenizer_trans_actions[_trans] == 0 ) |
11667
|
0 |
0 |
do |
11668
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11676
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
11679
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
11684
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
11686
|
0 |
0 |
do |
11687
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11695
|
0 |
7 |
do |
11696
|
0 |
7 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11704
|
0 |
3 |
do |
11705
|
0 |
3 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11712
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
11714
|
0 |
0 |
do |
11715
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11723
|
0 |
0 |
do |
11724
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11732
|
10 |
29 |
switch ( _generic_tokenizer_to_state_actions[cs] ) { |
11738
|
39 |
0 |
if ( cs == 0 ) |
11740
|
35 |
4 |
if ( ++( current) != ( (chars.size() - 1)) ) |
11743
|
4 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
11745
|
2 |
2 |
if ( _generic_tokenizer_eof_trans[cs] > 0 ) { |
12127
|
3 |
0 |
initialize_ragel_map(); |
12131
|
0 |
7 |
while (ragel_map_flag.test_and_set()) {} |
12132
|
2 |
5 |
if (ragel_map.empty()) { |
12133
|
256 |
2 |
for (uint8_t ascii = 0; ascii < 128; ascii++) |
12145
|
2 |
6 |
if (chr >= ragel_map.size()) |
12165
|
10 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
12173
|
0 |
22 |
if ( _klen > 0 ) { |
12178
|
0 |
0 |
if ( _upper < _lower ) |
12182
|
0 |
0 |
if ( _widec < _mid[0] ) |
12184
|
0 |
0 |
else if ( _widec > _mid[1] ) |
12190
|
0 |
0 |
if ( |
12196
|
0 |
0 |
if ( |
12210
|
22 |
0 |
if ( _klen > 0 ) { |
12215
|
63 |
22 |
if ( _upper < _lower ) |
12219
|
12 |
51 |
if ( _widec < *_mid ) |
12221
|
51 |
0 |
else if ( _widec > *_mid ) |
12233
|
22 |
0 |
if ( _klen > 0 ) { |
12238
|
61 |
8 |
if ( _upper < _lower ) |
12242
|
8 |
53 |
if ( _widec < _mid[0] ) |
12244
|
39 |
14 |
else if ( _widec > _mid[1] ) |
12258
|
0 |
22 |
if ( _ragel_url_email_trans_actions[_trans] == 0 ) |
12279
|
14 |
8 |
if ( cs == 0 ) |
12281
|
12 |
2 |
if ( ++( current) != ( (chars.size() - 1)) ) |
12287
|
0 |
10 |
if (end > start) { |
12316
|
1 |
0 |
vertical_tokenizer() : unicode_tokenizer(0) {} |
12372
|
4 |
0 |
ragel_tokenizer::initialize_ragel_map(); |
12374
|
4 |
0 |
set_text(string_piece(nullptr, 0)); |
12380
|
3 |
4 |
if (make_copy && text.str) { |
|
3 |
0 |
if (make_copy && text.str) { |
12387
|
145 |
7 |
for (const char* curr_str = text.str; text.len; curr_str = text.str) |
12393
|
7 |
0 |
vector& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer; |
12395
|
7 |
0 |
if (forms) forms->clear(); |
12396
|
4 |
3 |
if (current >= chars.size() - 1) return false; |
12399
|
4 |
0 |
if (forms) |
12400
|
33 |
4 |
for (auto&& token : tokens) |
12407
|
10 |
2 |
if (current >= chars.size() - 1) return false; |
12409
|
10 |
0 |
return url_email_tokenizer ? ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens) : false; |
12416
|
0 |
10 |
return tokens.size() >= 500 || |
12417
|
10 |
0 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
0 |
0 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
0 |
10 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
12418
|
0 |
0 |
(tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po); |
12424
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
12426
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
12430
|
0 |
0 |
if (abbreviations) { |
12432
|
0 |
0 |
for (size_t i = 0; i < tokens.back().length; i++) |
12434
|
0 |
0 |
if (abbreviations->count(eos_buffer)) |
12459
|
2 |
0 |
if (current >= chars.size() - 1) return false; |
12463
|
116 |
2 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
116 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
26 |
90 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
90 |
28 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
12466
|
26 |
2 |
if (current < chars.size() - 1) { |
12468
|
26 |
0 |
if (current < chars.size() - 1 && |
|
0 |
26 |
if (current < chars.size() - 1 && |
|
0 |
26 |
if (current < chars.size() - 1 && |
12469
|
0 |
0 |
((chars[current-1].chr == '\r' && chars[current].chr == '\n') || |
|
26 |
0 |
((chars[current-1].chr == '\r' && chars[current].chr == '\n') || |
12470
|
0 |
26 |
(chars[current-1].chr == '\n' && chars[current].chr == '\r'))) |
12474
|
26 |
2 |
if (line_start < line_end) |
12562
|
0 |
0 |
return {1, 11, 1, ""}; |
|
0 |
0 |
return {1, 11, 1, ""}; |
12573
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
12575
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
12577
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
12611
|
42 |
42 |
default: return (bilou_entity - B_first) & 1 ? bilou_type_U : bilou_type_B; |
12615
|
28 |
42 |
switch (bilou_entity) { |
12750
|
1 |
0 |
if (tagger.reset(tagger::load_instance(is)), !tagger) return false; |
12751
|
1 |
0 |
if (!named_entities.load(is)) return false; |
12754
|
1 |
0 |
if (!templates.load(is, nlp_pipeline(tokenizer.get(), tagger.get()))) return false; |
|
1 |
0 |
if (!templates.load(is, nlp_pipeline(tokenizer.get(), tagger.get()))) return false; |
12756
|
1 |
0 |
int stages = is.get(); |
12757
|
1 |
0 |
if (stages == EOF) return false; |
12758
|
1 |
0 |
networks.resize(stages); |
12759
|
2 |
1 |
for (auto&& network : networks) |
12760
|
2 |
0 |
if (!network.load(is)) return false; |
|
2 |
0 |
if (!network.load(is)) return false; |
12767
|
2 |
0 |
if (forms.empty() || !tagger || !named_entities.size() || !networks.size()) return; |
|
2 |
0 |
if (forms.empty() || !tagger || !named_entities.size() || !networks.size()) return; |
|
2 |
0 |
if (forms.empty() || !tagger || !named_entities.size() || !networks.size()) return; |
|
0 |
2 |
if (forms.empty() || !tagger || !named_entities.size() || !networks.size()) return; |
|
2 |
0 |
if (forms.empty() || !tagger || !named_entities.size() || !networks.size()) return; |
12771
|
1 |
1 |
if (!c) c = new cache(); |
12777
|
2 |
0 |
if (sentence.size) { |
12781
|
4 |
2 |
for (auto&& network : networks) { |
12789
|
14 |
4 |
for (unsigned i = 0; i < sentence.size; i++) { |
12790
|
14 |
0 |
if (!sentence.probabilities[i].local_filled) { |
12796
|
4 |
10 |
if (i == 0) { |
12808
|
7 |
2 |
for (unsigned i = 0; i < sentence.size; i++) |
12809
|
3 |
4 |
if (sentence.probabilities[i].global.best == bilou_type_U) { |
12811
|
0 |
4 |
} else if (sentence.probabilities[i].global.best == bilou_type_B) { |
12813
|
0 |
0 |
while (i < sentence.size && sentence.probabilities[i].global.best != bilou_type_L) i++; |
|
0 |
0 |
while (i < sentence.size && sentence.probabilities[i].global.best != bilou_type_L) i++; |
|
0 |
0 |
while (i < sentence.size && sentence.probabilities[i].global.best != bilou_type_L) i++; |
12830
|
0 |
0 |
for (unsigned i = 0; i < types.size(); i++) |
12836
|
0 |
0 |
if (gazetteer_types) gazetteer_types->clear(); |
12842
|
14 |
70 |
for (auto&& prob_bilou : prob.bilou) |
12845
|
126 |
14 |
for (bilou_entity::value i = 0; i < outcomes.size(); i++) { |
12847
|
70 |
56 |
if (outcomes[i] > prob.bilou[bilou].probability) { |
12882
|
0 |
0 |
if (it == str2id.end() && add_entity) { |
|
0 |
0 |
if (it == str2id.end() && add_entity) { |
|
0 |
0 |
if (it == str2id.end() && add_entity) { |
12886
|
0 |
0 |
return it == str2id.end() ? entity_type_unknown : it->second; |
12890
|
0 |
0 |
return entity < id2str.size() ? id2str[entity] : empty; |
|
0 |
0 |
return entity < id2str.size() ? id2str[entity] : empty; |
|
3 |
0 |
return entity < id2str.size() ? id2str[entity] : empty; |
|
0 |
0 |
return entity < id2str.size() ? id2str[entity] : empty; |
|
0 |
0 |
return entity < id2str.size() ? id2str[entity] : empty; |
12895
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
12899
|
1 |
0 |
id2str.resize(data.next_4B()); |
|
1 |
0 |
id2str.resize(data.next_4B()); |
12900
|
3 |
1 |
for (unsigned i = 0; i < id2str.size(); i++) { |
12901
|
3 |
0 |
data.next_str(id2str[i]); |
12903
|
0 |
0 |
} |
12930
|
1 |
0 |
switch (id) { |
12936
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
1 |
if (res->load(is)) return res.release(); |
12945
|
1 |
0 |
ifstream in(path_from_utf8(fname).c_str(), ifstream::in | ifstream::binary); |
12946
|
1 |
0 |
if (!in.is_open()) return nullptr; |
12948
|
1 |
0 |
return load(in); |
12986
|
0 |
0 |
for (; len--; str++, pos++) |
|
0 |
0 |
for (; len--; str++, pos++) |
|
0 |
0 |
for (; len--; str++, pos++) |
12987
|
0 |
0 |
if (*str == c) |
|
0 |
0 |
if (*str == c) |
|
0 |
0 |
if (*str == c) |
13003
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
13007
|
0 |
0 |
if (space < form.len) { |
13013
|
0 |
0 |
if (space < form.len) { |
13081
|
1 |
0 |
morpho = tagger ? tagger->get_morpho() : nullptr; |
13082
|
1 |
0 |
return tagger && morpho; |
|
0 |
1 |
return tagger && morpho; |
13086
|
0 |
0 |
if (params.empty()) return cerr << "Missing tagger_file argument to morphodita_tagger!" << endl, false; |
13089
|
0 |
0 |
if (!in.is_open()) return cerr << "Cannot open morphodita tagger file '" << params << "'!" << endl, false; |
13090
|
0 |
0 |
if (!load(in)) return cerr << "Cannot load morphodita tagger from file '" << params << "'!" << endl, false; |
|
0 |
0 |
if (!load(in)) return cerr << "Cannot load morphodita tagger from file '" << params << "'!" << endl, false; |
13092
|
0 |
0 |
if (!in.seekg(0, ifstream::beg)) return cerr << "Cannot seek in morphodita tagger file '" << params << "'!" << endl, false; |
|
0 |
0 |
if (!in.seekg(0, ifstream::beg)) return cerr << "Cannot seek in morphodita tagger file '" << params << "'!" << endl, false; |
13093
|
0 |
0 |
os << in.rdbuf(); |
13100
|
2 |
0 |
if (!tagger || !morpho) return; |
|
2 |
0 |
if (!tagger || !morpho) return; |
|
2 |
0 |
if (!tagger || !morpho) return; |
13104
|
1 |
1 |
if (!c) c = new cache(); |
13110
|
2 |
0 |
if (c->tags.size() >= forms.size()) { |
13112
|
7 |
2 |
for (unsigned i = 0; i < forms.size(); i++) { |
13122
|
11 |
7 |
for (auto&& analysis : c->analyses) |
13123
|
11 |
0 |
sentence.words[i].raw_lemmas_all.emplace_back(analysis.lemma, 0, morpho->raw_lemma_len(analysis.lemma)); |
|
11 |
0 |
sentence.words[i].raw_lemmas_all.emplace_back(analysis.lemma, 0, morpho->raw_lemma_len(analysis.lemma)); |
13174
|
1 |
0 |
if (!res) return nullptr; |
13175
|
1 |
0 |
if (!res->load(is)) return nullptr; |
|
1 |
0 |
if (!res->load(is)) return nullptr; |
13185
|
0 |
0 |
if (colon == string::npos) { |
13188
|
0 |
0 |
tagger_id = tagger_id_and_params.substr(0, colon); |
13189
|
0 |
0 |
params = tagger_id_and_params.substr(colon + 1); |
13194
|
0 |
0 |
if (!tagger_ids::parse(tagger_id, id)) return cerr << "Unknown tagger_id '" << tagger_id << "'!" << endl, nullptr; |
|
0 |
0 |
if (!tagger_ids::parse(tagger_id, id)) return cerr << "Unknown tagger_id '" << tagger_id << "'!" << endl, nullptr; |
13197
|
0 |
0 |
unique_ptr res(create(id)); |
13198
|
0 |
0 |
if (!res) return cerr << "Cannot create instance for tagger_id '" << tagger_id << "'!" << endl, nullptr; |
13201
|
0 |
0 |
os.put(id); |
13202
|
0 |
0 |
if (!res->create_and_encode(params, os)) return cerr << "Cannot encode instance of tagger_id '" << tagger_id << "'!" << endl, nullptr; |
|
0 |
0 |
if (!res->create_and_encode(params, os)) return cerr << "Cannot encode instance of tagger_id '" << tagger_id << "'!" << endl, nullptr; |
13243
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
13544
|
0 |
0 |
for (; *str; str++) |
13545
|
0 |
0 |
if (((unsigned char)*str) >= 0x80) { |
13546
|
0 |
0 |
if (((unsigned char)*str) < 0xC0) return false; |
13547
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
13548
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13549
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
13550
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13551
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13552
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
13553
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13554
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13555
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13562
|
0 |
0 |
for (; len > 0; str++, len--) |
13563
|
0 |
0 |
if (((unsigned char)*str) >= 0x80) { |
13564
|
0 |
0 |
if (((unsigned char)*str) < 0xC0) return false; |
13565
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
13566
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13567
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
13568
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13569
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13570
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
13571
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13572
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13573
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
13582
|
0 |
0 |
for (char32_t chr; (chr = decode(str)); ) |
13589
|
0 |
0 |
while (len) |
13596
|
0 |
0 |
for (auto&& chr : str) |
13624
|
0 |
0 |
return {3, 3, 0, ""}; |
|
0 |
0 |
return {3, 3, 0, ""}; |
14154
|
244 |
31893 |
IF_BIT_0(prob) |
|
784 |
31353 |
IF_BIT_0(prob) |
14159
|
777 |
7 |
if (checkDicSize != 0 || processedPos != 0) |
14161
|
0 |
777 |
(dic[(dicPos == 0 ? dicBufSize : dicPos) - 1] >> (8 - lc)))); |
14163
|
440 |
344 |
if (state < kNumLitStates) |
14167
|
385 |
3135 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
2054 |
1466 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
3080 |
440 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
14171
|
0 |
344 |
unsigned matchByte = p->dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
14173
|
95 |
249 |
state -= (state < 10) ? 3 : 6; |
14182
|
310 |
2442 |
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
|
1825 |
927 |
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
14184
|
2408 |
344 |
while (symbol < 0x100); |
14194
|
198 |
31155 |
IF_BIT_0(prob) |
|
267 |
31086 |
IF_BIT_0(prob) |
14203
|
31086 |
0 |
if (checkDicSize == 0 && processedPos == 0) |
14206
|
120 |
30966 |
IF_BIT_0(prob) |
|
30975 |
111 |
IF_BIT_0(prob) |
14210
|
123 |
30852 |
IF_BIT_0(prob) |
|
54 |
30921 |
IF_BIT_0(prob) |
14213
|
0 |
54 |
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
14216
|
3 |
51 |
state = state < kNumLitStates ? 9 : 11; |
14226
|
22 |
89 |
IF_BIT_0(prob) |
|
61 |
50 |
IF_BIT_0(prob) |
14235
|
4 |
46 |
IF_BIT_0(prob) |
|
31 |
19 |
IF_BIT_0(prob) |
14251
|
30873 |
159 |
state = state < kNumLitStates ? 8 : 11; |
14257
|
206 |
31093 |
IF_BIT_0(probLen) |
|
276 |
31023 |
IF_BIT_0(probLen) |
14268
|
111 |
30912 |
IF_BIT_0(probLen) |
|
54 |
30969 |
IF_BIT_0(probLen) |
14283
|
1134 |
247608 |
TREE_DECODE(probLen, limit, len); |
|
1269 |
247473 |
TREE_DECODE(probLen, limit, len); |
|
217443 |
31299 |
TREE_DECODE(probLen, limit, len); |
14287
|
267 |
31032 |
if (state >= kNumStates) |
14292
|
34 |
233 |
TREE_6_DECODE(prob, distance); |
|
194 |
73 |
TREE_6_DECODE(prob, distance); |
|
39 |
228 |
TREE_6_DECODE(prob, distance); |
|
244 |
23 |
TREE_6_DECODE(prob, distance); |
|
22 |
245 |
TREE_6_DECODE(prob, distance); |
|
169 |
98 |
TREE_6_DECODE(prob, distance); |
|
29 |
238 |
TREE_6_DECODE(prob, distance); |
|
112 |
155 |
TREE_6_DECODE(prob, distance); |
|
25 |
242 |
TREE_6_DECODE(prob, distance); |
|
154 |
113 |
TREE_6_DECODE(prob, distance); |
|
30 |
237 |
TREE_6_DECODE(prob, distance); |
|
178 |
89 |
TREE_6_DECODE(prob, distance); |
14293
|
229 |
38 |
if (distance >= kStartPosModelIndex) |
14298
|
119 |
110 |
if (posSlot < kEndPosModelIndex) |
14305
|
218 |
119 |
do |
14307
|
41 |
296 |
GET_BIT2(prob + i, i, ; , distance |= mask); |
|
147 |
190 |
GET_BIT2(prob + i, i, ; , distance |= mask); |
14316
|
1025 |
110 |
do |
14318
|
137 |
998 |
NORMALIZE |
14342
|
14 |
96 |
GET_BIT2(prob + i, i, ; , distance |= 1); |
|
53 |
57 |
GET_BIT2(prob + i, i, ; , distance |= 1); |
14343
|
20 |
90 |
GET_BIT2(prob + i, i, ; , distance |= 2); |
|
51 |
59 |
GET_BIT2(prob + i, i, ; , distance |= 2); |
14344
|
9 |
101 |
GET_BIT2(prob + i, i, ; , distance |= 4); |
|
66 |
44 |
GET_BIT2(prob + i, i, ; , distance |= 4); |
14345
|
18 |
92 |
GET_BIT2(prob + i, i, ; , distance |= 8); |
|
70 |
40 |
GET_BIT2(prob + i, i, ; , distance |= 8); |
14347
|
0 |
110 |
if (distance == (uint32_t)0xFFFFFFFF) |
14359
|
267 |
0 |
if (checkDicSize == 0) |
14361
|
267 |
0 |
if (distance >= processedPos) |
14364
|
0 |
0 |
else if (distance >= checkDicSize) |
14366
|
130 |
137 |
state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; |
14371
|
31299 |
0 |
if (limit == dicPos) |
14375
|
0 |
31299 |
unsigned curLen = ((rem < len) ? (unsigned)rem : len); |
14376
|
0 |
31299 |
size_t pos = (dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0); |
14381
|
31299 |
0 |
if (pos + curLen <= dicBufSize) |
14387
|
8397330 |
31299 |
do |
14393
|
0 |
0 |
do |
14396
|
0 |
0 |
if (++pos == dicBufSize) |
14404
|
32017 |
120 |
while (dicPos < limit && buf < bufLimit); |
14405
|
26 |
94 |
NORMALIZE; |
14423
|
0 |
127 |
if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) |
14430
|
0 |
0 |
if (limit - dicPos < len) |
14433
|
0 |
0 |
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
|
0 |
0 |
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
14438
|
0 |
0 |
while (len-- != 0) |
14440
|
0 |
0 |
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
14452
|
120 |
0 |
if (p->checkDicSize == 0) |
14455
|
0 |
120 |
if (limit - p->dicPos > rem) |
14458
|
120 |
0 |
RINOK(LzmaDec_DecodeReal(p, limit2, bufLimit)); |
14459
|
0 |
120 |
if (p->processedPos >= p->prop.dicSize) |
14463
|
113 |
7 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
0 |
113 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
0 |
0 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
14465
|
0 |
120 |
if (p->remainLen > kMatchSpecLenStart) |
14496
|
0 |
110 |
IF_BIT_0_CHECK(prob) |
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
69 |
41 |
IF_BIT_0_CHECK(prob) |
14503
|
68 |
1 |
if (p->checkDicSize != 0 || p->processedPos != 0) |
14506
|
0 |
68 |
(p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); |
14508
|
44 |
25 |
if (state < kNumLitStates) |
14511
|
28 |
324 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
28 |
0 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
218 |
134 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
308 |
44 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
14516
|
0 |
25 |
((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)]; |
14526
|
20 |
180 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
20 |
0 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
146 |
54 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
14528
|
175 |
25 |
while (symbol < 0x100); |
14538
|
3 |
38 |
IF_BIT_0_CHECK(prob) |
|
3 |
0 |
IF_BIT_0_CHECK(prob) |
|
20 |
21 |
IF_BIT_0_CHECK(prob) |
14550
|
1 |
20 |
IF_BIT_0_CHECK(prob) |
|
1 |
0 |
IF_BIT_0_CHECK(prob) |
|
15 |
6 |
IF_BIT_0_CHECK(prob) |
14554
|
2 |
13 |
IF_BIT_0_CHECK(prob) |
|
2 |
0 |
IF_BIT_0_CHECK(prob) |
|
9 |
6 |
IF_BIT_0_CHECK(prob) |
14557
|
3 |
6 |
NORMALIZE_CHECK; |
|
3 |
0 |
NORMALIZE_CHECK; |
14569
|
2 |
4 |
IF_BIT_0_CHECK(prob) |
|
2 |
0 |
IF_BIT_0_CHECK(prob) |
|
4 |
2 |
IF_BIT_0_CHECK(prob) |
14577
|
1 |
3 |
IF_BIT_0_CHECK(prob) |
|
1 |
0 |
IF_BIT_0_CHECK(prob) |
|
1 |
3 |
IF_BIT_0_CHECK(prob) |
14593
|
2 |
30 |
IF_BIT_0_CHECK(probLen) |
|
2 |
0 |
IF_BIT_0_CHECK(probLen) |
|
23 |
9 |
IF_BIT_0_CHECK(probLen) |
14604
|
0 |
9 |
IF_BIT_0_CHECK(probLen) |
|
0 |
0 |
IF_BIT_0_CHECK(probLen) |
|
4 |
5 |
IF_BIT_0_CHECK(probLen) |
14619
|
15 |
106 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
15 |
0 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
75 |
46 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
89 |
32 |
TREE_DECODE_CHECK(probLen, limit, len); |
14623
|
20 |
12 |
if (state < 4) |
14629
|
15 |
105 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
15 |
0 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
80 |
40 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
100 |
20 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
14630
|
18 |
2 |
if (posSlot >= kStartPosModelIndex) |
14636
|
10 |
8 |
if (posSlot < kEndPosModelIndex) |
14643
|
47 |
8 |
do |
14645
|
6 |
49 |
NORMALIZE_CHECK |
|
6 |
0 |
NORMALIZE_CHECK |
14656
|
45 |
18 |
do |
14658
|
8 |
55 |
GET_BIT_CHECK(prob + i, i); |
|
8 |
0 |
GET_BIT_CHECK(prob + i, i); |
|
18 |
45 |
GET_BIT_CHECK(prob + i, i); |
14666
|
22 |
79 |
NORMALIZE_CHECK; |
|
22 |
0 |
NORMALIZE_CHECK; |
14683
|
0 |
0 |
if (initDic) |
14689
|
0 |
0 |
if (initState) |
14704
|
55930 |
7 |
for (i = 0; i < numProbs; i++) |
14720
|
127 |
0 |
while (p->remainLen != kMatchSpecLenStart) |
14724
|
7 |
120 |
if (p->needFlush != 0) |
14726
|
42 |
0 |
for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) |
|
35 |
7 |
for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) |
14728
|
0 |
7 |
if (p->tempBufSize < RC_INIT_SIZE) |
14733
|
7 |
0 |
if (p->tempBuf[0] != 0) |
14741
|
7 |
120 |
if (p->dicPos >= dicLimit) |
14743
|
7 |
0 |
if (p->remainLen == 0 && p->code == 0) |
|
7 |
0 |
if (p->remainLen == 0 && p->code == 0) |
14748
|
0 |
0 |
if (finishMode == LZMA_FINISH_ANY) |
14753
|
0 |
0 |
if (p->remainLen != 0) |
14761
|
7 |
113 |
if (p->needInitState) |
14764
|
0 |
120 |
if (p->tempBufSize == 0) |
14768
|
110 |
10 |
if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
14771
|
0 |
110 |
if (dummyRes == DUMMY_ERROR) |
14779
|
0 |
110 |
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
14789
|
120 |
0 |
if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0) |
14799
|
0 |
0 |
while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) |
|
0 |
0 |
while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) |
14802
|
0 |
0 |
if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
14805
|
0 |
0 |
if (dummyRes == DUMMY_ERROR) |
14811
|
0 |
0 |
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
14818
|
0 |
0 |
if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0) |
14827
|
0 |
0 |
if (p->code == 0) |
14842
|
0 |
0 |
if (p->dicPos == p->dicBufSize) |
14845
|
0 |
0 |
if (outSize > p->dicBufSize - dicPos) |
14865
|
0 |
0 |
if (res != 0) |
14867
|
0 |
0 |
if (outSizeCur == 0 || outSize == 0) |
14895
|
7 |
0 |
if (size < LZMA_PROPS_SIZE) |
14900
|
0 |
7 |
if (dicSize < LZMA_DIC_MIN) |
14905
|
7 |
0 |
if (d >= (9 * 5 * 5)) |
14919
|
0 |
7 |
if (p->probs == 0 || numProbs != p->numProbs) |
|
0 |
0 |
if (p->probs == 0 || numProbs != p->numProbs) |
14924
|
7 |
0 |
if (p->probs == 0) |
14933
|
7 |
0 |
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
14934
|
7 |
0 |
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
14943
|
0 |
0 |
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
14944
|
0 |
0 |
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
14946
|
0 |
0 |
if (p->dic == 0 || dicBufSize != p->dicBufSize) |
|
0 |
0 |
if (p->dic == 0 || dicBufSize != p->dicBufSize) |
14950
|
0 |
0 |
if (p->dic == 0) |
14970
|
7 |
0 |
if (inSize < RC_INIT_SIZE) |
14975
|
7 |
0 |
if (res != 0) |
14985
|
7 |
0 |
if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT) |
|
0 |
7 |
if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT) |
14999
|
7 |
7 |
static void LzmaFree(void* /*p*/, void *address) { delete[] (char*) address; } |
15007
|
7 |
0 |
if (!is.read((char *) &uncompressed_len, sizeof(uncompressed_len))) return false; |
15008
|
7 |
0 |
if (!is.read((char *) &compressed_len, sizeof(compressed_len))) return false; |
15009
|
7 |
0 |
if (!is.read((char *) &poor_crc, sizeof(poor_crc))) return false; |
15010
|
7 |
0 |
if (poor_crc != uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890) return false; |
15011
|
7 |
0 |
if (!is.read((char *) props_encoded, sizeof(props_encoded))) return false; |
15014
|
7 |
0 |
if (!is.read((char *) compressed.data(), compressed_len)) return false; |
|
7 |
0 |
if (!is.read((char *) compressed.data(), compressed_len)) return false; |
15018
|
7 |
0 |
auto res = lzma::LzmaDecode(data.fill(uncompressed_len), &uncompressed_size, compressed.data(), &compressed_size, props_encoded, LZMA_PROPS_SIZE, lzma::LZMA_FINISH_ANY, &status, &lzmaAllocator); |
15019
|
7 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
7 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
7 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
16264
|
14 |
0 |
if ( p == ( (str.str + str.len)) ) |
16272
|
0 |
38 |
if ( _klen > 0 ) { |
16277
|
0 |
0 |
if ( _upper < _lower ) |
16281
|
0 |
0 |
if ( _widec < _mid[0] ) |
16283
|
0 |
0 |
else if ( _widec > _mid[1] ) |
16286
|
0 |
0 |
switch ( _url_detector_cond_spaces[_url_detector_cond_offsets[cs] + ((_mid - _keys)>>1)] ) { |
16289
|
0 |
0 |
if ( |
16303
|
38 |
0 |
if ( _klen > 0 ) { |
16308
|
104 |
38 |
if ( _upper < _lower ) |
16312
|
32 |
72 |
if ( _widec < *_mid ) |
16314
|
72 |
0 |
else if ( _widec > *_mid ) |
16326
|
38 |
0 |
if ( _klen > 0 ) { |
16331
|
100 |
10 |
if ( _upper < _lower ) |
16335
|
24 |
76 |
if ( _widec < _mid[0] ) |
16337
|
48 |
28 |
else if ( _widec > _mid[1] ) |
16351
|
0 |
38 |
if ( _url_detector_trans_actions[_trans] == 0 ) |
16356
|
0 |
0 |
while ( _nacts-- > 0 ) |
16376
|
28 |
10 |
if ( cs == 0 ) |
16378
|
24 |
4 |
if ( ++p != ( (str.str + str.len)) ) |
16407
|
0 |
14 |
if (length) *length = result_length; |
16408
|
14 |
0 |
return length || result_length == str.len ? result : NO_URL; |
|
0 |
14 |
return length || result_length == str.len ? result : NO_URL; |
16454
|
0 |
0 |
return {1, 2, 0, ""}; |
16466
|
0 |
0 |
<< (nametag.prerelease.empty() ? "" : "-") << nametag.prerelease |
|
0 |
0 |
<< (nametag.prerelease.empty() ? "" : "-") << nametag.prerelease |
16468
|
0 |
0 |
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
|
0 |
0 |
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
16470
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
16471
|
0 |
0 |
<< (other_libraries.empty() ? "" : "\nand ") << other_libraries << ")\n" |
|
0 |
0 |
<< (other_libraries.empty() ? "" : "\nand ") << other_libraries << ")\n" |
16473
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
16479
|
3 |
0 |
} // namespace ufal |
|
3 |
0 |
} // namespace ufal |