| line |
true |
false |
branch |
|
93
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
|
0 |
0 |
return os.write(str.str, str.len); |
|
97
|
1 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
1 |
2 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
1 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
1 |
2 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
1 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
1 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
15 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
15 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
3 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
16 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
16 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
16 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
207
|
0 |
0 |
class multiword_token : public token { |
|
|
0 |
0 |
class multiword_token : public token { |
|
|
0 |
0 |
class multiword_token : public token { |
|
229
|
0 |
0 |
class word : public token { |
|
259
|
0 |
0 |
class sentence { |
|
|
0 |
0 |
class sentence { |
|
|
0 |
0 |
class sentence { |
|
417
|
0 |
0 |
pair_system_gold(const word& system, const word& gold) : system(system), gold(gold) {} |
|
492
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
16 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
6 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
28 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
18 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
7 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
|
22 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
496
|
30 |
0 |
if (chr < CHARS) { |
|
498
|
2 |
28 |
if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8; |
|
499
|
0 |
28 |
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8; |
|
500
|
0 |
28 |
if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
|
506
|
0 |
0 |
if (chr < CHARS) { |
|
508
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
|
509
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8; |
|
510
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8; |
|
511
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
|
605
|
54 |
0 |
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
|
606
|
0 |
0 |
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
|
607
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
|
609
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
611
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
|
613
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
615
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
617
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
|
619
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
621
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
623
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
629
|
145 |
0 |
if (!len) return 0; |
|
631
|
122 |
23 |
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
|
632
|
0 |
23 |
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
|
633
|
23 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
|
635
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
637
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
|
639
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
641
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
643
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
|
645
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
647
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
649
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
674
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
36 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
33 |
3 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
18 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
15 |
3 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
706
|
0 |
0 |
iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; } |
|
|
0 |
0 |
iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; } |
|
740
|
25 |
5 |
if (chr < 0x80) str += chr; |
|
741
|
5 |
0 |
else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); } |
|
742
|
0 |
0 |
else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
|
743
|
0 |
0 |
else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
|
750
|
0 |
0 |
for (char32_t chr; (chr = decode(str)); ) |
|
757
|
29 |
7 |
while (len) |
|
762
|
0 |
0 |
map(f, str.c_str(), result); |
|
|
0 |
0 |
map(f, str.c_str(), result); |
|
|
0 |
0 |
map(f, str.c_str(), result); |
|
|
0 |
0 |
map(f, str.c_str(), result); |
|
|
0 |
0 |
map(f, str.c_str(), result); |
|
809
|
0 |
0 |
unique_ptr conllu_input(input_format::new_conllu_input_format()); |
|
810
|
0 |
0 |
if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false; |
|
|
0 |
0 |
if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false; |
|
812
|
0 |
0 |
vector plain_text_paragraphs(1); unsigned space_after_nos = 0; |
|
813
|
0 |
0 |
sentence system, gold; |
|
|
0 |
0 |
sentence system, gold; |
|
817
|
0 |
0 |
while (conllu_input->read_block(is, block)) { |
|
|
0 |
0 |
while (conllu_input->read_block(is, block)) { |
|
818
|
0 |
0 |
conllu_input->set_text(block); |
|
819
|
0 |
0 |
while (conllu_input->next_sentence(gold, error)) { |
|
|
0 |
0 |
while (conllu_input->next_sentence(gold, error)) { |
|
820
|
0 |
0 |
gold_data.add_sentence(gold); |
|
823
|
0 |
0 |
if (tokenizer != NONE) { |
|
824
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
825
|
0 |
0 |
plain_text_paragraphs.back().append("\n\n"); |
|
826
|
0 |
0 |
plain_text_paragraphs.emplace_back(); |
|
829
|
0 |
0 |
for (size_t i = 1, j = 0; i < gold.words.size(); i++) { |
|
830
|
0 |
0 |
const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i]; |
|
|
0 |
0 |
const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i]; |
|
832
|
0 |
0 |
if (tok.get_space_after()) |
|
|
0 |
0 |
if (tok.get_space_after()) |
|
833
|
0 |
0 |
plain_text_paragraphs.back().push_back(' '); |
|
836
|
0 |
0 |
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
|
842
|
0 |
0 |
if (tokenizer == NONE && tagger != NONE) { |
|
843
|
0 |
0 |
system.clear(); |
|
844
|
0 |
0 |
for (size_t i = 1; i < gold.words.size(); i++) |
|
847
|
0 |
0 |
if (tagger != NONE) { |
|
848
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
|
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
|
850
|
0 |
0 |
if (parser != NONE) |
|
851
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
854
|
0 |
0 |
system_goldtok_data.add_sentence(system); |
|
858
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
859
|
0 |
0 |
system.clear(); |
|
860
|
0 |
0 |
for (size_t i = 1; i < gold.words.size(); i++) { |
|
867
|
0 |
0 |
if (parser != NONE) |
|
868
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
870
|
0 |
0 |
system_goldtok_goldtags_data.add_sentence(system); |
|
873
|
0 |
0 |
if (!error.empty()) return false; |
|
877
|
0 |
0 |
if (tokenizer != NONE) { |
|
878
|
0 |
0 |
unique_ptr t(m->new_tokenizer(tokenizer)); |
|
879
|
0 |
0 |
if (!t) return error.assign("Cannot allocate new tokenizer!"), false; |
|
|
0 |
0 |
if (!t) return error.assign("Cannot allocate new tokenizer!"), false; |
|
881
|
0 |
0 |
for (auto&& plain_text : plain_text_paragraphs) { |
|
882
|
0 |
0 |
t->set_text(plain_text); |
|
883
|
0 |
0 |
while (t->next_sentence(system, error)) { |
|
|
0 |
0 |
while (t->next_sentence(system, error)) { |
|
884
|
0 |
0 |
if (tagger != NONE) { |
|
885
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
|
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
|
888
|
0 |
0 |
if (parser != NONE) |
|
889
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
892
|
0 |
0 |
system_plaintext_data.add_sentence(system); |
|
894
|
0 |
0 |
if (!error.empty()) return false; |
|
899
|
0 |
0 |
if (tokenizer != NONE) { |
|
900
|
0 |
0 |
if (system_plaintext_data.chars != gold_data.chars) { |
|
904
|
0 |
0 |
word_alignment::best_alignment(system_plaintext_data, gold_data, plaintext_alignment); |
|
912
|
0 |
0 |
if (multiwords.total_gold || multiwords.total_system) |
|
|
0 |
0 |
if (multiwords.total_gold || multiwords.total_system) |
|
926
|
0 |
0 |
if (tagger != NONE) { |
|
930
|
0 |
0 |
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
0 |
0 |
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
0 |
0 |
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
938
|
0 |
0 |
if (tagger != NONE && parser != NONE) { |
|
940
|
0 |
0 |
auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
0 |
0 |
auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
948
|
0 |
0 |
if (tokenizer == NONE && tagger != NONE) { |
|
950
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment)) |
|
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment)) |
|
951
|
0 |
0 |
return error.assign("Internal UDPipe error (the words of the gold data do not match)!"), false; |
|
956
|
0 |
0 |
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
0 |
0 |
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
|
0 |
0 |
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
963
|
0 |
0 |
if (parser != NONE) { |
|
965
|
0 |
0 |
auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
0 |
0 |
auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
972
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
974
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment)) |
|
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment)) |
|
975
|
0 |
0 |
return error.assign("Internal UDPipe error (the words of the goldtok data do not match)!"), false; |
|
978
|
0 |
0 |
auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
|
0 |
0 |
auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
989
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
990
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
992
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
998
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
1006
|
0 |
0 |
this->w.head = w.head ? id + (w.head - w.id) : 0; |
|
1014
|
0 |
0 |
if (colon != string::npos) |
|
1015
|
0 |
0 |
this->w.deprel.erase(colon); |
|
1020
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
1022
|
0 |
0 |
const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form; |
|
|
0 |
0 |
const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form; |
|
1023
|
0 |
0 |
for (auto&& chr : unilib::utf8::decoder(form)) |
|
1024
|
0 |
0 |
if (chr != ' ') |
|
1028
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
|
1030
|
0 |
0 |
for (size_t k = i; int(k) <= s.multiword_tokens[j].id_last; k++) { |
|
1045
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
|
0 |
0 |
for (auto&& match : matched) |
|
1046
|
0 |
0 |
if (equals(match.system, match.gold)) |
|
|
0 |
0 |
if (equals(match.system, match.gold)) |
|
|
0 |
0 |
if (equals(match.system, match.gold)) |
|
|
0 |
0 |
if (equals(match.system, match.gold)) |
|
1050
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
1056
|
0 |
0 |
if (alignment.total_system != alignment.total_gold) return false; |
|
1060
|
0 |
0 |
for (size_t i = 0; i < system.words.size(); i++) { |
|
1061
|
0 |
0 |
if (system.words[i].w.form != gold.words[i].w.form) |
|
1074
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
|
1075
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
1076
|
0 |
0 |
(gold.words[gi].start > system.words[si].start || !gold.words[gi].is_multiword)) { |
|
1078
|
0 |
0 |
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
|
|
0 |
0 |
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
|
|
0 |
0 |
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
|
1080
|
0 |
0 |
else if (system.words[si].start <= gold.words[gi].start) |
|
1086
|
0 |
0 |
size_t ss = si, gs = gi, multiword_range_end = system.words[si].is_multiword ? system.words[si].end : gold.words[gi].end; |
|
1089
|
0 |
0 |
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
|
|
0 |
0 |
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
|
|
0 |
0 |
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
|
1090
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
|
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
|
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
|
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
|
1091
|
0 |
0 |
(gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end : |
|
|
0 |
0 |
(gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end : |
|
1094
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
1095
|
0 |
0 |
if (system.words[si].is_multiword) multiword_range_end = max(multiword_range_end, system.words[si].end); |
|
1098
|
0 |
0 |
if (gold.words[gi].is_multiword) multiword_range_end = max(multiword_range_end, gold.words[gi].end); |
|
1105
|
0 |
0 |
for (unsigned s = si - ss; s--; ) { |
|
1106
|
0 |
0 |
lcs[s].resize(gi - gs); |
|
1107
|
0 |
0 |
for (unsigned g = gi - gs; g--; ) { |
|
1108
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], s+1 < lcs.size() ? lcs[s+1][g] : 0); |
|
1109
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], g+1 < lcs[s].size() ? lcs[s][g+1] : 0); |
|
1110
|
0 |
0 |
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
|
1111
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0)); |
|
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0)); |
|
1115
|
0 |
0 |
for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) { |
|
|
0 |
0 |
for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) { |
|
1116
|
0 |
0 |
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
|
1117
|
0 |
0 |
alignment.matched.emplace_back(system.words[ss + s++].w, gold.words[gs + g++].w); |
|
1118
|
0 |
0 |
else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0)) |
|
|
0 |
0 |
else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0)) |
|
1127
|
0 |
0 |
for (auto&& match : alignment.matched) |
|
1129
|
0 |
0 |
for (auto&& match : alignment.matched) |
|
1130
|
0 |
0 |
if (match.system.head > 0) |
|
1395
|
0 |
0 |
class node { |
|
1430
|
0 |
0 |
class tree { |
|
|
0 |
0 |
class tree { |
|
|
0 |
0 |
class tree { |
|
|
0 |
0 |
class tree { |
|
|
0 |
0 |
class tree { |
|
1499
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
1502
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
|
0 |
0 |
class binary_decoder { |
|
1527
|
6 |
0 |
buffer.resize(len); |
|
1535
|
0 |
1308 |
if (data + 1 > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
1540
|
0 |
26 |
if (data + sizeof(uint16_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
1548
|
0 |
1573 |
if (data + sizeof(uint32_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
1557
|
1 |
35 |
if (len == 255) len = next_4B(); |
|
1562
|
0 |
603 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
|
0 |
484 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
|
0 |
185 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
1577
|
0 |
1 |
if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder"); |
|
1683
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
34 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
1688
|
34 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
8 |
26 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
1694
|
0 |
34 |
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false; |
|
1698
|
34 |
34 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
34 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
34 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
1699
|
26 |
8 |
if (positive) { |
|
1700
|
0 |
26 |
if (value > (numeric_limits::max() - (str.str[0] - '0')) / 10) |
|
1704
|
0 |
8 |
if (value < (numeric_limits::min() + (str.str[0] - '0')) / 10) |
|
1712
|
0 |
34 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
1716
|
0 |
34 |
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false; |
|
1725
|
0 |
0 |
if (!parse_int(str, value_name, result, error)) |
|
|
0 |
0 |
if (!parse_int(str, value_name, result, error)) |
|
1866
|
0 |
3 |
for (size_t start = 0; start < values.size(); ) { |
|
1867
|
0 |
0 |
while (start < values.size() && values[start] == ';') start++; |
|
|
0 |
0 |
while (start < values.size() && values[start] == ';') start++; |
|
|
0 |
0 |
while (start < values.size() && values[start] == ';') start++; |
|
1868
|
0 |
0 |
if (start >= values.size()) break; |
|
1871
|
0 |
0 |
name.assign(values, start, name_end - start); |
|
1874
|
0 |
0 |
if (name_end == string::npos) { |
|
1876
|
0 |
0 |
} else if (values[name_end] == ';') { |
|
1881
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
1886
|
0 |
0 |
file.assign(values, file_name, semicolon - file_name); |
|
1887
|
0 |
0 |
ifstream is(path_from_utf8(file).c_str()); |
|
1888
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
|
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
|
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
|
1891
|
0 |
0 |
for (value.clear(); is.read(buffer, sizeof(buffer)); ) |
|
|
0 |
0 |
for (value.clear(); is.read(buffer, sizeof(buffer)); ) |
|
1892
|
0 |
0 |
value.append(buffer, sizeof(buffer)); |
|
1893
|
0 |
0 |
value.append(buffer, is.gcount()); |
|
1896
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
1900
|
0 |
0 |
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
|
|
0 |
0 |
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
|
|
0 |
0 |
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
|
1903
|
0 |
0 |
if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false; |
|
|
0 |
0 |
if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false; |
|
1906
|
0 |
0 |
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
|
|
0 |
0 |
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
|
|
0 |
0 |
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
|
1907
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
1909
|
0 |
0 |
value.assign(values, data_start, data_end - data_start); |
|
1914
|
0 |
0 |
value.assign(values, equal_sign + 1, semicolon - equal_sign - 1); |
|
1961
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
1970
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
1971
|
0 |
0 |
if (!stack.empty()) { |
|
|
0 |
0 |
if (!stack.empty()) { |
|
|
0 |
1 |
if (!stack.empty()) { |
|
|
0 |
1 |
if (!stack.empty()) { |
|
|
0 |
1 |
if (!stack.empty()) { |
|
|
0 |
1 |
if (!stack.empty()) { |
|
2027
|
1 |
0 |
struct parser_cache { |
|
2090
|
1 |
0 |
ifstream in(path_from_utf8(fname).c_str(), ifstream::in | ifstream::binary); |
|
2091
|
1 |
0 |
if (!in.is_open()) return nullptr; |
|
2092
|
1 |
0 |
return load(in); |
|
2097
|
1 |
0 |
if (!is.get(len)) return nullptr; |
|
2099
|
1 |
0 |
if (!is.read(&name[0], len)) return nullptr; |
|
|
1 |
0 |
if (!is.read(&name[0], len)) return nullptr; |
|
2101
|
1 |
0 |
if (name == "morphodita_parsito") return model_morphodita_parsito::load(is); |
|
|
1 |
0 |
if (name == "morphodita_parsito") return model_morphodita_parsito::load(is); |
|
2245
|
0 |
0 |
for (string line; getline(is, line); ) { |
|
|
0 |
0 |
for (string line; getline(is, line); ) { |
|
2247
|
0 |
0 |
para.push_back('\n'); |
|
2249
|
0 |
0 |
if (line.empty()) break; |
|
2252
|
0 |
0 |
if (is.eof() && !para.empty()) is.clear(istream::eofbit); |
|
|
0 |
0 |
if (is.eof() && !para.empty()) is.clear(istream::eofbit); |
|
|
0 |
0 |
if (is.eof() && !para.empty()) is.clear(istream::eofbit); |
|
2294
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
2299
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
2305
|
0 |
0 |
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': empty string."), false; |
|
2309
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
2315
|
0 |
0 |
if (str.len && str.str[0] == '.') { |
|
|
0 |
0 |
if (str.len && str.str[0] == '.') { |
|
2319
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
2327
|
0 |
0 |
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
|
2330
|
0 |
0 |
if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) { |
|
|
0 |
0 |
if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) { |
|
2335
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
2340
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
2345
|
0 |
0 |
exponent = pow(10., exponent_negative ? -exponent : exponent); |
|
2346
|
0 |
0 |
if (!isfinite(exponent)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent overflow occured."), false; |
|
2347
|
0 |
0 |
if (exponent == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent underflow occured."), false; |
|
2349
|
0 |
0 |
if (value) { |
|
2351
|
0 |
0 |
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
|
2352
|
0 |
0 |
if (value == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': underflow occured."), false; |
|
2357
|
0 |
0 |
if (negative) value *= -1; |
|
2360
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
2364
|
0 |
0 |
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false; |
|
2373
|
0 |
0 |
if (!parse_double(str, value_name, result, error)) |
|
|
0 |
0 |
if (!parse_double(str, value_name, result, error)) |
|
2400
|
1 |
0 |
if (!tokenizer_factory) |
|
2405
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
2408
|
1 |
0 |
bool normalized_spaces = parsed_options.count("normalized_spaces"); |
|
2409
|
1 |
0 |
bool token_ranges = parsed_options.count("ranges"); |
|
2411
|
1 |
0 |
const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr; |
|
|
1 |
0 |
const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr; |
|
2412
|
1 |
0 |
unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges)); |
|
|
1 |
0 |
unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges)); |
|
|
1 |
0 |
unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges)); |
|
2415
|
0 |
0 |
if (parsed_options.count("presegmented") && result) |
|
|
0 |
1 |
if (parsed_options.count("presegmented") && result) |
|
2416
|
0 |
0 |
result.reset(input_format::new_presegmented_tokenizer(result.release())); |
|
2419
|
0 |
0 |
if (parsed_options.count("joint_with_parsing") && result) { |
|
|
0 |
1 |
if (parsed_options.count("joint_with_parsing") && result) { |
|
2421
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
2425
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
2429
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
2432
|
0 |
0 |
result.reset(new joint_with_parsing_tokenizer(result.release(), *this, max_sentence_len, change_boundary_logprob, sentence_logprob)); |
|
2441
|
0 |
1 |
if (taggers.empty()) return error.assign("No tagger defined for the UDPipe model!"), false; |
|
2442
|
1 |
0 |
if (s.empty()) return true; |
|
2445
|
1 |
0 |
if (!c) c = new tagger_cache(); |
|
2450
|
1 |
7 |
for (size_t i = 1; i < s.words.size(); i++) |
|
2454
|
7 |
1 |
for (size_t i = 1; i < s.words.size(); i++) { |
|
2462
|
1 |
1 |
for (auto&& tagger : taggers) { |
|
2463
|
0 |
1 |
if (!tagger.tagger) return error.assign("No tagger defined for the UDPipe model!"), false; |
|
2467
|
7 |
1 |
for (size_t i = 0; i < c->lemmas.size(); i++) |
|
2472
|
1 |
0 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
|
0 |
1 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
|
0 |
0 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
|
0 |
1 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
2474
|
0 |
0 |
for (size_t i = 0; i < c->forms_string_pieces.size(); i++) { |
|
2475
|
0 |
0 |
if (morpho->analyze(c->forms_string_pieces[i], morphodita::morpho::GUESSER, c->lemmas) == morphodita::morpho::GUESSER) |
|
2476
|
0 |
0 |
s.words[i + 1].misc.append(s.words[i + 1].misc.empty() ? "" : "|").append("MorphoGuesser=Yes"); |
|
2491
|
0 |
1 |
if (!parser) return error.assign("No parser defined for the UDPipe model!"), false; |
|
2492
|
1 |
0 |
if (s.empty()) return true; |
|
2495
|
1 |
0 |
if (!c) c = new parser_cache(); |
|
2498
|
1 |
0 |
if (!named_values::parse(options, c->options, error)) |
|
2500
|
0 |
1 |
if (c->options.count("beam_search")) |
|
2501
|
0 |
0 |
if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error)) |
|
|
0 |
0 |
if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error)) |
|
2505
|
7 |
1 |
for (size_t i = 1; i < s.words.size(); i++) { |
|
2517
|
7 |
1 |
for (size_t i = 1; i < s.words.size(); i++) |
|
2526
|
1 |
0 |
if (!is.get(version)) return nullptr; |
|
2527
|
1 |
0 |
if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr; |
|
2532
|
0 |
1 |
if (version >= 2) { |
|
2534
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
2535
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
2539
|
1 |
0 |
if (!m) return nullptr; |
|
2542
|
1 |
0 |
if (!is.get(tokenizer)) return nullptr; |
|
|
1 |
0 |
if (!is.get(tokenizer)) return nullptr; |
|
2543
|
1 |
0 |
m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
|
1 |
0 |
m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
2544
|
1 |
0 |
if (tokenizer && !m->tokenizer_factory) return nullptr; |
|
|
1 |
0 |
if (tokenizer && !m->tokenizer_factory) return nullptr; |
|
|
1 |
0 |
if (tokenizer && !m->tokenizer_factory) return nullptr; |
|
2545
|
1 |
0 |
m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
|
1 |
0 |
m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
2546
|
1 |
0 |
if (tokenizer && !m->splitter) return nullptr; |
|
|
1 |
0 |
if (tokenizer && !m->splitter) return nullptr; |
|
|
1 |
0 |
if (tokenizer && !m->splitter) return nullptr; |
|
2549
|
1 |
0 |
char taggers; if (!is.get(taggers)) return nullptr; |
|
|
1 |
0 |
char taggers; if (!is.get(taggers)) return nullptr; |
|
2550
|
1 |
1 |
for (char i = 0; i < taggers; i++) { |
|
2551
|
1 |
0 |
char lemma; if (!is.get(lemma)) return nullptr; |
|
|
1 |
0 |
char lemma; if (!is.get(lemma)) return nullptr; |
|
2552
|
1 |
0 |
char xpostag; if (!is.get(xpostag)) return nullptr; |
|
|
1 |
0 |
char xpostag; if (!is.get(xpostag)) return nullptr; |
|
2553
|
1 |
0 |
char feats; if (!is.get(feats)) return nullptr; |
|
|
1 |
0 |
char feats; if (!is.get(feats)) return nullptr; |
|
2554
|
1 |
0 |
int model_type = is.peek(); |
|
2557
|
1 |
0 |
model_type == morphodita::tagger_ids::CONLLU3); |
|
|
1 |
0 |
model_type == morphodita::tagger_ids::CONLLU3); |
|
2558
|
1 |
0 |
morphodita::tagger* tagger = morphodita::tagger::load(is); |
|
2559
|
1 |
0 |
if (!tagger) return nullptr; |
|
2560
|
1 |
0 |
m->taggers.emplace_back(raw, i == 0, int(lemma), bool(xpostag), bool(feats), tagger); |
|
2564
|
1 |
0 |
if (!is.get(parser)) return nullptr; |
|
|
1 |
0 |
if (!is.get(parser)) return nullptr; |
|
2565
|
1 |
0 |
m->parser.reset(parser ? parsito::parser::load(is) : nullptr); |
|
|
1 |
0 |
m->parser.reset(parser ? parsito::parser::load(is) : nullptr); |
|
2566
|
1 |
0 |
if (parser && !m->parser) return nullptr; |
|
|
1 |
0 |
if (parser && !m->parser) return nullptr; |
|
|
1 |
0 |
if (parser && !m->parser) return nullptr; |
|
2576
|
0 |
0 |
for (string line; getline(is, line); ) { |
|
|
0 |
0 |
for (string line; getline(is, line); ) { |
|
2578
|
0 |
0 |
block.push_back('\n'); |
|
2581
|
0 |
0 |
if (is.eof() && !block.empty()) is.clear(istream::eofbit); |
|
|
0 |
0 |
if (is.eof() && !block.empty()) is.clear(istream::eofbit); |
|
|
0 |
0 |
if (is.eof() && !block.empty()) is.clear(istream::eofbit); |
|
2595
|
0 |
0 |
if (make_copy) { |
|
2605
|
0 |
0 |
if (text.len) { |
|
2613
|
0 |
0 |
while (tokenizer->next_sentence(input, error)) { |
|
|
0 |
0 |
while (tokenizer->next_sentence(input, error)) { |
|
2614
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
|
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
|
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
|
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
|
2615
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
|
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
|
2616
|
0 |
0 |
for (auto&& sentence : paragraph) |
|
2617
|
0 |
0 |
sentences.push_back(sentence); |
|
2620
|
0 |
0 |
paragraph.push_back(input); |
|
2622
|
0 |
0 |
if (!error.empty()) return false; |
|
2624
|
0 |
0 |
if (!paragraph.empty()) { |
|
2625
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
|
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
|
2626
|
0 |
0 |
for (auto&& sentence : paragraph) |
|
2627
|
0 |
0 |
sentences.push_back(sentence); |
|
2633
|
0 |
0 |
if (sentences_index < sentences.size()) { |
|
2643
|
0 |
0 |
vector sentence_boundary(1, true); |
|
2644
|
0 |
0 |
vector token_boundary(1, true); |
|
2646
|
0 |
0 |
for (auto&& s : paragraph) { |
|
2648
|
0 |
0 |
for (unsigned i = 1; i < s.words.size(); i++) { |
|
2649
|
0 |
0 |
all_words.words.push_back(s.words[i]); |
|
2651
|
0 |
0 |
sentence_boundary.push_back(i+1 == s.words.size()); |
|
2652
|
0 |
0 |
token_boundary.push_back(true); |
|
2655
|
0 |
0 |
for (auto&& mwt : s.multiword_tokens) { |
|
2656
|
0 |
0 |
all_words.multiword_tokens.push_back(mwt); |
|
2659
|
0 |
0 |
for (int i = all_words.multiword_tokens.back().id_first; i < all_words.multiword_tokens.back().id_last; i++) |
|
2664
|
0 |
0 |
vector best_logprob(all_words.words.size(), -numeric_limits::infinity()); best_logprob[0] = 0.; |
|
2665
|
0 |
0 |
vector best_length(all_words.words.size(), 0); |
|
2666
|
0 |
0 |
sentence s; |
|
2668
|
0 |
0 |
for (unsigned start = 1; start < all_words.words.size(); start++) { |
|
2669
|
0 |
0 |
if (!token_boundary[start - 1]) continue; |
|
2670
|
0 |
0 |
s.clear(); |
|
2671
|
0 |
0 |
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
|
|
0 |
0 |
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
|
|
0 |
0 |
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
|
2672
|
0 |
0 |
s.words.push_back(all_words.words[end - 1]); |
|
2674
|
0 |
0 |
if (!token_boundary[end - 1]) continue; |
|
2676
|
0 |
0 |
for (unsigned i = 1; i < s.words.size(); i++) { |
|
2682
|
0 |
0 |
if (!model.parse(s, DEFAULT, error, &cost)) return false; |
|
|
0 |
0 |
if (!model.parse(s, DEFAULT, error, &cost)) return false; |
|
2684
|
0 |
0 |
if (best_logprob[start - 1] + cost > best_logprob[end - 1]) { |
|
2692
|
0 |
0 |
for (unsigned end = all_words.words.size(); end > 1; end -= best_length[end - 1]) |
|
2693
|
0 |
0 |
sentence_lengths.push_back(best_length[end - 1]); |
|
2699
|
0 |
0 |
for (unsigned i = 1; i < sentence_lengths.size(); i++) { |
|
2702
|
0 |
0 |
paragraph.emplace_back(); |
|
2703
|
0 |
0 |
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
|
|
0 |
0 |
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
|
|
0 |
0 |
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
|
2704
|
0 |
0 |
paragraph.back().multiword_tokens.push_back(all_words.multiword_tokens.front()); |
|
2710
|
0 |
0 |
for (unsigned word = sentence_lengths[i - 1]; word < sentence_lengths[i]; word++) { |
|
2711
|
0 |
0 |
paragraph.back().words.push_back(all_words.words[word]); |
|
2718
|
0 |
0 |
if (!paragraph.empty()) { |
|
2719
|
0 |
0 |
if (new_document) { |
|
2720
|
0 |
0 |
paragraph.front().set_new_doc(true, document_id); |
|
2724
|
0 |
0 |
paragraph.front().set_new_par(true); |
|
2732
|
0 |
7 |
if (raw) { |
|
2733
|
0 |
0 |
if (lemma) word.lemma.assign(analysis.lemma); |
|
2734
|
0 |
0 |
if (xpostag) word.xpostag.assign(analysis.tag); |
|
2739
|
7 |
0 |
if (lemma == 1) { |
|
2741
|
0 |
0 |
} else if (lemma == 2) { |
|
2745
|
0 |
0 |
if (analysis.lemma[0] == '~') { |
|
2747
|
0 |
0 |
if (end != string::npos) { |
|
2749
|
0 |
0 |
if (analysis.lemma.compare(end + 1, string::npos, word.lemma) == 0) |
|
2756
|
0 |
7 |
if (version == 2) { |
|
2758
|
0 |
0 |
for (auto && chr : word.lemma) |
|
2759
|
0 |
0 |
if (chr == '\001') |
|
2761
|
0 |
7 |
} else if (version >= 3) { |
|
2763
|
0 |
0 |
for (size_t i = 0; i + 1 < word.lemma.size(); i++) |
|
2764
|
0 |
0 |
if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0)) |
|
|
0 |
0 |
if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0)) |
|
|
0 |
0 |
if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0)) |
|
2768
|
0 |
7 |
if (!upostag && !xpostag && !feats) return; |
|
|
0 |
0 |
if (!upostag && !xpostag && !feats) return; |
|
2773
|
7 |
0 |
if (upostag) word.upostag.assign(analysis.tag, start, end - start); |
|
2775
|
7 |
0 |
if (!xpostag && !feats) return; |
|
2780
|
7 |
0 |
if (xpostag) word.xpostag.assign(analysis.tag, start, end - start); |
|
2782
|
7 |
0 |
if (!feats) return; |
|
2793
|
14 |
0 |
if (version <= 1) return output.assign(form.str, form.len); |
|
2835
|
0 |
0 |
for (auto&& chr : utf8::decoder(form.str, form.len)) { |
|
2837
|
0 |
0 |
if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {} |
|
|
0 |
0 |
if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {} |
|
2838
|
0 |
0 |
else if (chr == 0x622) utf8::append(output, 0x627); |
|
2839
|
0 |
0 |
else if (chr == 0x623) utf8::append(output, 0x627); |
|
2840
|
0 |
0 |
else if (chr == 0x624) utf8::append(output, 0x648); |
|
2841
|
0 |
0 |
else if (chr == 0x625) utf8::append(output, 0x627); |
|
2842
|
0 |
0 |
else if (chr == 0x626) utf8::append(output, 0x64A); |
|
2843
|
0 |
0 |
else if (chr == 0x671) utf8::append(output, 0x627); |
|
2844
|
0 |
0 |
else if (chr == 0x6A9) utf8::append(output, 0x643); |
|
2845
|
0 |
0 |
else if (chr == 0x6AA) utf8::append(output, 0x643); |
|
2846
|
0 |
0 |
else if (chr == 0x6CC) utf8::append(output, 0x64A); |
|
2848
|
0 |
0 |
else if (chr == ' ' && version == 2) utf8::append(output, 0x01); |
|
|
0 |
0 |
else if (chr == ' ' && version == 2) utf8::append(output, 0x01); |
|
2849
|
0 |
0 |
else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0); |
|
|
0 |
0 |
else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0); |
|
2855
|
0 |
0 |
if (output.empty() && form.len) |
|
|
0 |
0 |
if (output.empty() && form.len) |
|
|
0 |
0 |
if (output.empty() && form.len) |
|
2865
|
7 |
0 |
if (version <= 2) return output.assign(lemma.str, lemma.len); |
|
2869
|
0 |
0 |
for (size_t i = 0; i < lemma.len; i++) { |
|
2871
|
0 |
0 |
if (lemma.str[i] == ' ') utf8::append(output, 0xA0); |
|
2982
|
0 |
0 |
for (string line; getline(is, line); ) |
|
|
0 |
0 |
for (string line; getline(is, line); ) |
|
2983
|
0 |
0 |
whole.append(line).push_back('\n'); |
|
2985
|
0 |
0 |
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
|
|
0 |
0 |
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
|
|
0 |
0 |
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
|
3009
|
0 |
0 |
set_input(input); |
|
3012
|
0 |
0 |
set_output(output); |
|
3022
|
0 |
0 |
if (input.empty()) { |
|
3024
|
0 |
0 |
} else if (input == "tokenize" || input == "tokenizer") { |
|
3026
|
0 |
0 |
} else if (input.compare(0, 10, "tokenizer=") == 0) { |
|
3043
|
0 |
0 |
this->output = output.empty() ? "conllu" : output; |
|
3060
|
0 |
0 |
if (input == "tokenizer") { |
|
3061
|
0 |
0 |
reader.reset(m->new_tokenizer(tokenizer)); |
|
3062
|
0 |
0 |
if (!reader) return error.assign("The model does not have a tokenizer!"), false; |
|
|
0 |
0 |
if (!reader) return error.assign("The model does not have a tokenizer!"), false; |
|
3064
|
0 |
0 |
reader.reset(input_format::new_input_format(input)); |
|
3065
|
0 |
0 |
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
|
|
0 |
0 |
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
|
|
0 |
0 |
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
|
3067
|
0 |
0 |
reader->reset_document(document_id); |
|
3069
|
0 |
0 |
unique_ptr writer(output_format::new_output_format(output)); |
|
3070
|
0 |
0 |
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
|
|
0 |
0 |
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
|
|
0 |
0 |
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
|
3073
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
3074
|
0 |
0 |
reader->set_text(block); |
|
3075
|
0 |
0 |
while (reader->next_sentence(s, error)) { |
|
|
0 |
0 |
while (reader->next_sentence(s, error)) { |
|
3076
|
0 |
0 |
if (tagger != NONE) |
|
3077
|
0 |
0 |
if (!m->tag(s, tagger, error)) |
|
|
0 |
0 |
if (!m->tag(s, tagger, error)) |
|
3080
|
0 |
0 |
if (parser != NONE) |
|
3081
|
0 |
0 |
if (!m->parse(s, parser, error)) |
|
|
0 |
0 |
if (!m->parse(s, parser, error)) |
|
3084
|
0 |
0 |
writer->write_sentence(s, os); |
|
3086
|
0 |
0 |
if (!error.empty()) return false; |
|
3088
|
0 |
0 |
writer->finish_document(os); |
|
3198
|
0 |
0 |
format_tagged_lemma(result); |
|
3203
|
0 |
0 |
for (auto&& lemma : lemmas) |
|
3206
|
0 |
0 |
if (lemmas.size() > 1) |
|
3214
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
3218
|
0 |
0 |
if (converter) converter->convert_analyzed(lemmas); |
|
3231
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
|
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
|
3233
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
3241
|
0 |
0 |
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
|
|
0 |
0 |
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
|
3250
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
3251
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
|
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
|
3252
|
0 |
0 |
tagged_lemma parrent_lemma(parent.lemma, current.tag); |
|
3253
|
0 |
0 |
if (converter) converter->convert(parrent_lemma); |
|
|
0 |
0 |
if (converter) converter->convert(parrent_lemma); |
|
3254
|
0 |
0 |
lemma.lemma.append(" ").append(parrent_lemma.lemma); |
|
3263
|
0 |
0 |
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
|
|
0 |
0 |
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
|
3272
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
3273
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
|
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
|
3274
|
0 |
0 |
format_tree(root, tag, lemma, converter); |
|
3280
|
0 |
0 |
if (converter) { |
|
3281
|
0 |
0 |
tagged_lemma current(root, tag); |
|
3282
|
0 |
0 |
converter->convert(current); |
|
3283
|
0 |
0 |
tree.lemma.append(" ").append(current.lemma); |
|
3285
|
0 |
0 |
tree.lemma.append(" ").append(root); |
|
3288
|
0 |
0 |
if (derinet->children(root, children)) |
|
|
0 |
0 |
if (derinet->children(root, children)) |
|
3289
|
0 |
0 |
for (auto&& child : children) |
|
3290
|
0 |
0 |
format_tree(child.lemma, tag, tree, converter); |
|
3291
|
0 |
0 |
tree.lemma.push_back(' '); |
|
3299
|
0 |
0 |
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
|
|
0 |
0 |
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
|
3303
|
0 |
0 |
if (name == "none") return new_none_derivation_formatter(); |
|
3304
|
0 |
0 |
if (name == "root") return new_root_derivation_formatter(derinet); |
|
3305
|
0 |
0 |
if (name == "path") return new_path_derivation_formatter(derinet); |
|
3306
|
0 |
0 |
if (name == "tree") return new_tree_derivation_formatter(derinet); |
|
3336
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
529 |
133 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
133 |
36 |
while (len--) |
|
|
1009 |
66 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
64 |
10 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
3337
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
307 |
222 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
111 |
22 |
if (*a++ != *b++) |
|
|
1000 |
9 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
61 |
3 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
|
0 |
0 |
if (*a++ != *b++) |
|
3346
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
729 |
346 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
20 |
118 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
120 |
20 |
while (len--) |
|
3419
|
0 |
0 |
data.reserve(16); |
|
3423
|
0 |
0 |
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
|
|
0 |
0 |
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
|
|
0 |
0 |
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
|
3428
|
0 |
0 |
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
|
|
0 |
0 |
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
|
|
0 |
0 |
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
|
3447
|
0 |
0 |
if (!(str.len < 255)) add_4B(str.len); |
|
3606
|
30 |
10 |
while (size) { |
|
|
0 |
0 |
while (size) { |
|
|
0 |
0 |
while (size) { |
|
3608
|
21 |
9 |
if (unaligned_load(first + step) < val) { |
|
|
0 |
0 |
if (unaligned_load(first + step) < val) { |
|
|
0 |
0 |
if (unaligned_load(first + step) < val) { |
|
3650
|
0 |
0 |
class persistent_unordered_map { |
|
|
0 |
0 |
class persistent_unordered_map { |
|
3696
|
0 |
0 |
struct persistent_unordered_map::fnv_hash { |
|
3699
|
52 |
24 |
while (mask < num) |
|
3701
|
24 |
0 |
hash.resize(mask + 1); |
|
3705
|
484 |
0 |
uint32_t size = data.next_4B(); |
|
3707
|
484 |
0 |
hash.resize(size); |
|
3708
|
484 |
0 |
memcpy(hash.data(), data.next(size), size * sizeof(uint32_t)); |
|
3710
|
484 |
0 |
size = data.next_4B(); |
|
3711
|
484 |
0 |
this->data.resize(size); |
|
3712
|
145 |
339 |
if (size) memcpy(this->data.data(), data.next(size), size); |
|
|
145 |
0 |
if (size) memcpy(this->data.data(), data.next(size), size); |
|
3716
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
8 |
0 |
if (len <= 0) return 0; |
|
|
0 |
8 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
0 |
0 |
if (len <= 0) return 0; |
|
|
330 |
0 |
if (len <= 0) return 0; |
|
|
78 |
0 |
if (len <= 0) return 0; |
|
|
20 |
0 |
if (len <= 0) return 0; |
|
|
20 |
0 |
if (len <= 0) return 0; |
|
3717
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
2 |
6 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
|
13 |
317 |
if (len == 1) return unaligned_load(data); |
|
|
6 |
72 |
if (len == 1) return unaligned_load(data); |
|
|
4 |
16 |
if (len == 1) return unaligned_load(data); |
|
|
4 |
16 |
if (len == 1) return unaligned_load(data); |
|
3718
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
5 |
1 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
|
48 |
269 |
if (len == 2) return unaligned_load(data); |
|
|
67 |
5 |
if (len == 2) return unaligned_load(data); |
|
|
15 |
1 |
if (len == 2) return unaligned_load(data); |
|
|
15 |
1 |
if (len == 2) return unaligned_load(data); |
|
3721
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
38 |
5 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
0 |
0 |
while (len--) |
|
|
144 |
48 |
while (len--) |
|
|
1003 |
67 |
while (len--) |
|
|
114 |
15 |
while (len--) |
|
|
114 |
15 |
while (len--) |
|
3735
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
8 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
3741
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
8 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
|
0 |
0 |
if (len <= 2) |
|
3742
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
8 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
3744
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
3745
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
3756
|
330 |
16 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
|
78 |
14 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
3762
|
48 |
282 |
if (len <= 2) |
|
|
67 |
11 |
if (len <= 2) |
|
3763
|
234 |
48 |
return data != end ? (const T*)(data + len) : nullptr; |
|
|
10 |
1 |
return data != end ? (const T*)(data + len) : nullptr; |
|
3765
|
58 |
12 |
while (data < end) { |
|
|
75 |
1 |
while (data < end) { |
|
3766
|
36 |
22 |
if (small_memeq(str, data, len)) return (const T*)(data + len); |
|
|
66 |
9 |
if (small_memeq(str, data, len)) return (const T*)(data + len); |
|
3775
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
|
8 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
3781
|
0 |
0 |
while (data < end) { |
|
|
13 |
8 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
3791
|
1 |
1 |
for (unsigned len = 0; len < hashes.size(); len++) { |
|
|
0 |
0 |
for (unsigned len = 0; len < hashes.size(); len++) { |
|
|
0 |
0 |
for (unsigned len = 0; len < hashes.size(); len++) { |
|
3795
|
1 |
1 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
|
0 |
0 |
while (data < end) { |
|
3809
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
3813
|
2 |
22 |
if (hashes.size() == 0) hashes.emplace_back(1); |
|
3814
|
2 |
20 |
else if (hashes.size() == 1) hashes.emplace_back(1<<8); |
|
3815
|
2 |
18 |
else if (hashes.size() == 2) hashes.emplace_back(1<<16); |
|
3820
|
20 |
0 |
if (unsigned(str_len) < hashes.size()) |
|
3825
|
24 |
2 |
for (auto&& hash : hashes) { |
|
3827
|
131633 |
24 |
for (auto&& len : hash.hash) total += len, len = total - len; |
|
3833
|
20 |
0 |
if (unsigned(str_len) < hashes.size()) { |
|
3844
|
24 |
2 |
for (auto&& hash : hashes) |
|
3845
|
131633 |
24 |
for (int i = hash.hash.size() - 1; i >= 0; i--) |
|
3846
|
131609 |
24 |
hash.hash[i] = i > 0 ? hash.hash[i-1] : 0; |
|
3853
|
484 |
103 |
for (unsigned i = 0; i < sizes; i++) |
|
3931
|
0 |
0 |
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
|
3938
|
0 |
0 |
if (lemma_data) { |
|
3940
|
0 |
0 |
if (parent_encoded) { |
|
3944
|
0 |
0 |
if (parent_data[parent_len]) |
|
3954
|
0 |
0 |
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
|
3961
|
0 |
0 |
if (lemma_data) { |
|
3964
|
0 |
0 |
if (children_len) { |
|
3966
|
0 |
0 |
for (unsigned i = 0; i < children_len; i++) { |
|
3970
|
0 |
0 |
if (child_data[child_len]) |
|
3982
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
3985
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
3986
|
0 |
0 |
derinet.resize(data.next_4B()); |
|
|
0 |
0 |
derinet.resize(data.next_4B()); |
|
3990
|
0 |
0 |
for (int pass = 1; pass <= 3; pass++) { |
|
3991
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
3994
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
3995
|
0 |
0 |
lemma.resize(lemma.size() - data.next_1B()); |
|
|
0 |
0 |
lemma.resize(lemma.size() - data.next_1B()); |
|
3996
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
3997
|
0 |
0 |
lemma.push_back(data.next_1B()); |
|
3999
|
0 |
0 |
unsigned char lemma_comment_len = data.next_1B(); |
|
4000
|
0 |
0 |
const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr; |
|
|
0 |
0 |
const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr; |
|
4002
|
0 |
0 |
unsigned children = data.next_2B(); |
|
4004
|
0 |
0 |
if (pass == 3) parent.clear(); |
|
4006
|
0 |
0 |
int operations = data.next_1B(); |
|
4007
|
0 |
0 |
if (operations) { |
|
4008
|
0 |
0 |
int remove_start = operations & REMOVE_START ? data.next_1B() : 0; |
|
|
0 |
0 |
int remove_start = operations & REMOVE_START ? data.next_1B() : 0; |
|
4009
|
0 |
0 |
int remove_end = operations & REMOVE_END ? data.next_1B() : 0; |
|
|
0 |
0 |
int remove_end = operations & REMOVE_END ? data.next_1B() : 0; |
|
4010
|
0 |
0 |
if (operations & ADD_START) { |
|
4011
|
0 |
0 |
int add_start = data.next_1B(); |
|
4012
|
0 |
0 |
const char* str = data.next(add_start); |
|
4013
|
0 |
0 |
if (pass == 3) parent.assign(str, str + add_start); |
|
4015
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end); |
|
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end); |
|
4016
|
0 |
0 |
if (operations & ADD_END) { |
|
4017
|
0 |
0 |
int add_end = data.next_1B(); |
|
4018
|
0 |
0 |
const char* str = data.next(add_end); |
|
4019
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), str, str + add_end); |
|
4023
|
0 |
0 |
if (pass == 1) { |
|
4025
|
0 |
0 |
} else if (pass == 2) { |
|
4028
|
0 |
0 |
while (lemma_comment_len--) *lemma_data++ = *lemma_comment++; |
|
4031
|
0 |
0 |
if (children) unaligned_store(((uint32_t*)lemma_data) + children - 1, 0); |
|
4032
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
|
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
|
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
|
4043
|
0 |
0 |
assert(lemma_data && parent_data); |
|
4046
|
0 |
0 |
assert(parent.size() < (1<<8) && parent_offset < (1<<24)); |
|
|
0 |
0 |
assert(parent.size() < (1<<8) && parent_offset < (1<<24)); |
|
4050
|
0 |
0 |
assert(lemma.size() < (1<<8) && lemma_offset < (1<<24)); |
|
|
0 |
0 |
assert(lemma.size() < (1<<8) && lemma_offset < (1<<24)); |
|
4055
|
0 |
0 |
if (child_index+1 < children_len) |
|
4060
|
0 |
0 |
if (pass == 1) |
|
4061
|
0 |
0 |
derinet.done_adding(); |
|
4062
|
0 |
0 |
if (pass == 2) |
|
4064
|
0 |
0 |
} |
|
4097
|
22 |
7 |
while (form_tmp.len && !rest_has_Lut) |
|
|
22 |
0 |
while (form_tmp.len && !rest_has_Lut) |
|
4106
|
1 |
6 |
if (first_Lut && !rest_has_Lut) { // common case allowing fast execution |
|
4111
|
0 |
6 |
} else if (!first_Lut && rest_has_Lut) { |
|
4114
|
0 |
6 |
} else if (first_Lut && rest_has_Lut) { |
|
4121
|
0 |
0 |
while (form_tmp.len) { |
|
4162
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) |
|
4163
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_' || |
|
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_' || |
|
4164
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
4171
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) { |
|
4172
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_') |
|
4174
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
4176
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
4186
|
0 |
0 |
if (addinfo_len) { |
|
4187
|
0 |
0 |
res.reserve(addinfo_len + 4); |
|
4188
|
0 |
0 |
if (addinfo[0] != 255) { |
|
4193
|
0 |
0 |
for (int i = 1; i < addinfo_len; i++) |
|
4201
|
0 |
0 |
for (int i = 1; i + 2 < addinfo_len; i++) |
|
4202
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
4212
|
0 |
0 |
if (lemma_info < lemma.str + lemma.len) { |
|
4216
|
0 |
0 |
if (*lemma_info == '-') { |
|
4219
|
0 |
0 |
lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9'); |
|
|
0 |
0 |
lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9'); |
|
4223
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
4224
|
0 |
0 |
if (die_on_failure) |
|
4225
|
0 |
0 |
training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!"); |
|
|
0 |
0 |
training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!"); |
|
|
0 |
0 |
training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!"); |
|
4231
|
0 |
0 |
while (lemma_additional_info < lemma.str + lemma.len) |
|
4234
|
0 |
0 |
if (data.size() > 255) { |
|
4235
|
0 |
0 |
if (die_on_failure) |
|
4236
|
0 |
0 |
training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!'); |
|
|
0 |
0 |
training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!'); |
|
4246
|
0 |
0 |
if (data.empty()) return true; |
|
4247
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
4291
|
0 |
0 |
if (filters.empty()) return true; |
|
4294
|
0 |
0 |
for (auto&& filter : filters) { |
|
4296
|
0 |
0 |
while (tag_pos < filter.pos) |
|
4297
|
0 |
0 |
if (!tag[tag_pos++]) |
|
4299
|
0 |
0 |
if (!tag[tag_pos]) |
|
4304
|
0 |
0 |
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
|
|
0 |
0 |
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
|
4306
|
0 |
0 |
if (!matched) return false; |
|
4346
|
12 |
1 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
4348
|
12 |
1 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
4354
|
1 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
|
|
0 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
|
|
0 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
|
4356
|
2 |
1 |
for (int pass = 1; pass <= 2; pass++) { |
|
|
0 |
0 |
for (int pass = 1; pass <= 2; pass++) { |
|
|
0 |
0 |
for (int pass = 1; pass <= 2; pass++) { |
|
4357
|
1 |
1 |
if (pass > 1) data.seek(data_position); |
|
|
1 |
0 |
if (pass > 1) data.seek(data_position); |
|
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
4362
|
2 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
|
20 |
2 |
for (int i = data.next_4B(); i > 0; i--) { |
|
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
4363
|
20 |
0 |
lemma_len -= data.next_1B(); |
|
|
0 |
0 |
lemma_len -= data.next_1B(); |
|
|
0 |
0 |
lemma_len -= data.next_1B(); |
|
4364
|
20 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
106 |
20 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
4365
|
106 |
0 |
lemma[lemma_len++] = data.next_1B(); |
|
|
0 |
0 |
lemma[lemma_len++] = data.next_1B(); |
|
|
0 |
0 |
lemma[lemma_len++] = data.next_1B(); |
|
4366
|
20 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
|
|
0 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
|
|
0 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
|
4367
|
0 |
20 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
4368
|
20 |
0 |
unsigned lemma_roots = data.next_1B(); |
|
|
0 |
0 |
unsigned lemma_roots = data.next_1B(); |
|
|
0 |
0 |
unsigned lemma_roots = data.next_1B(); |
|
4373
|
10 |
10 |
if (pass == 1) { |
|
|
0 |
0 |
if (pass == 1) { |
|
|
0 |
0 |
if (pass == 1) { |
|
4380
|
0 |
10 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
|
0 |
0 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
|
0 |
0 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
4385
|
20 |
20 |
for (unsigned i = 0; i < lemma_roots; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots; i++) { |
|
4387
|
20 |
0 |
int operations = data.next_1B(); |
|
|
0 |
0 |
int operations = data.next_1B(); |
|
|
0 |
0 |
int operations = data.next_1B(); |
|
4388
|
4 |
16 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
4 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
28 |
4 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
4389
|
12 |
8 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
|
12 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
4390
|
6 |
14 |
if (operations & ADD_START) { |
|
|
0 |
0 |
if (operations & ADD_START) { |
|
|
0 |
0 |
if (operations & ADD_START) { |
|
4391
|
6 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
|
38 |
6 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
4392
|
8 |
6 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
|
8 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
4394
|
12 |
8 |
if (operations & ADD_END) |
|
|
0 |
0 |
if (operations & ADD_END) |
|
|
0 |
0 |
if (operations & ADD_END) |
|
4395
|
12 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
|
22 |
12 |
for (int len = data.next_1B(); len > 0; len--) |
|
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
4396
|
22 |
0 |
root[root_len++] = data.next_1B(); |
|
|
0 |
0 |
root[root_len++] = data.next_1B(); |
|
|
0 |
0 |
root[root_len++] = data.next_1B(); |
|
4397
|
20 |
0 |
uint16_t clas = data.next_2B(); |
|
|
0 |
0 |
uint16_t clas = data.next_2B(); |
|
|
0 |
0 |
uint16_t clas = data.next_2B(); |
|
4399
|
10 |
10 |
if (pass == 1) { // for each root |
|
|
0 |
0 |
if (pass == 1) { // for each root |
|
|
0 |
0 |
if (pass == 1) { // for each root |
|
4408
|
0 |
10 |
assert(uint8_t(lemma_len) == lemma_len); |
|
|
0 |
0 |
assert(uint8_t(lemma_len) == lemma_len); |
|
|
0 |
0 |
assert(uint8_t(lemma_len) == lemma_len); |
|
4413
|
0 |
10 |
assert(uint8_t(root_len) == root_len); |
|
|
0 |
0 |
assert(uint8_t(root_len) == root_len); |
|
|
0 |
0 |
assert(uint8_t(root_len) == root_len); |
|
4418
|
1 |
1 |
if (pass == 1) { // after the whole pass |
|
|
0 |
0 |
if (pass == 1) { // after the whole pass |
|
|
0 |
0 |
if (pass == 1) { // after the whole pass |
|
4419
|
1 |
0 |
lemmas.done_adding(); |
|
|
0 |
0 |
lemmas.done_adding(); |
|
|
0 |
0 |
lemmas.done_adding(); |
|
4420
|
1 |
0 |
roots.done_adding(); |
|
|
0 |
0 |
roots.done_adding(); |
|
|
0 |
0 |
roots.done_adding(); |
|
4428
|
1 |
0 |
tags.resize(data.next_2B()); |
|
|
1 |
0 |
tags.resize(data.next_2B()); |
|
|
0 |
0 |
tags.resize(data.next_2B()); |
|
|
0 |
0 |
tags.resize(data.next_2B()); |
|
|
0 |
0 |
tags.resize(data.next_2B()); |
|
|
0 |
0 |
tags.resize(data.next_2B()); |
|
4429
|
6 |
1 |
for (auto&& tag : tags) { |
|
|
0 |
0 |
for (auto&& tag : tags) { |
|
|
0 |
0 |
for (auto&& tag : tags) { |
|
4430
|
6 |
0 |
tag.resize(data.next_1B()); |
|
|
0 |
0 |
tag.resize(data.next_1B()); |
|
|
0 |
0 |
tag.resize(data.next_1B()); |
|
4431
|
397 |
6 |
for (unsigned i = 0; i < tag.size(); i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < tag.size(); i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < tag.size(); i++) |
|
4432
|
397 |
0 |
tag[i] = data.next_1B(); |
|
|
0 |
0 |
tag[i] = data.next_1B(); |
|
|
0 |
0 |
tag[i] = data.next_1B(); |
|
4436
|
1 |
0 |
suffixes.load(data); |
|
|
0 |
0 |
suffixes.load(data); |
|
|
0 |
0 |
suffixes.load(data); |
|
4439
|
1 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
|
0 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
|
0 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
4444
|
6 |
1 |
for (unsigned i = 0; i < classes_len; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) |
|
4450
|
6 |
1 |
for (unsigned i = 0; i < classes_len; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) { |
|
4452
|
6 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
|
6 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
4455
|
6 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
|
0 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
|
0 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
4456
|
6 |
6 |
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
|
|
0 |
0 |
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
|
|
0 |
0 |
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
|
4457
|
6 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
|
0 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
|
0 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
4467
|
0 |
8 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
4469
|
8 |
8 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
|
0 |
0 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
|
0 |
0 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
4477
|
8 |
8 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
0 |
8 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
8 |
8 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
4478
|
8 |
0 |
if (unaligned_load(suff[suff_len])) { |
|
|
0 |
0 |
if (unaligned_load(suff[suff_len])) { |
|
|
0 |
0 |
if (unaligned_load(suff[suff_len])) { |
|
4482
|
8 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
|
0 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
|
0 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
4487
|
10 |
3 |
if (small_memeq(form.str, root, root_len)) { |
|
|
0 |
0 |
if (small_memeq(form.str, root, root_len)) { |
|
|
0 |
0 |
if (small_memeq(form.str, root, root_len)) { |
|
4489
|
10 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
0 |
10 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
10 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
4492
|
0 |
10 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
4496
|
10 |
10 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
|
0 |
0 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
|
0 |
0 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
4498
|
10 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
|
0 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
|
0 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
4508
|
0 |
0 |
int raw_lemma_len = addinfo.parse(lemma); |
|
|
0 |
0 |
int raw_lemma_len = addinfo.parse(lemma); |
|
4511
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
4517
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
4522
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
4528
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
|
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
|
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
|
4530
|
0 |
0 |
for (auto&& tag : suffix.second) |
|
|
0 |
0 |
for (auto&& tag : suffix.second) |
|
|
0 |
0 |
for (auto&& tag : suffix.second) |
|
4531
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
|
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
|
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
|
4532
|
0 |
0 |
if (!forms) { |
|
|
0 |
0 |
if (!forms) { |
|
|
0 |
0 |
if (!forms) { |
|
4533
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
4537
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
4538
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
4543
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
|
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
|
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
|
4590
|
0 |
0 |
for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) { |
|
4594
|
0 |
0 |
tag_filters.emplace_back(tag_filter.c_str()); |
|
4605
|
0 |
0 |
if (!form.len) return; |
|
4609
|
0 |
0 |
middle_masks.reserve(form.len); |
|
4611
|
0 |
0 |
for (unsigned initial = 0; initial < form.len; initial++) { |
|
4614
|
0 |
0 |
if (initial) { |
|
4616
|
0 |
0 |
if (!found) break; |
|
4621
|
0 |
0 |
if (initial_mask) { |
|
4622
|
0 |
0 |
middle_masks.resize(initial); |
|
4623
|
0 |
0 |
middle_masks.emplace_back(initial_mask); |
|
4624
|
0 |
0 |
for (unsigned middle = initial; middle < middle_masks.size(); middle++) { |
|
4625
|
0 |
0 |
if (!middle_masks[middle]) continue; |
|
4627
|
0 |
0 |
for (unsigned i = middle + 1; i < form.len; i++) { |
|
4629
|
0 |
0 |
if (!found) break; |
|
4630
|
0 |
0 |
if (unaligned_load(found)) { |
|
4631
|
0 |
0 |
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
|
|
0 |
0 |
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
|
4637
|
0 |
0 |
if (middle > initial && middle < form.len ) { |
|
|
0 |
0 |
if (middle > initial && middle < form.len ) { |
|
4638
|
0 |
0 |
if (initial) { |
|
4639
|
0 |
0 |
if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len); |
|
4643
|
0 |
0 |
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
|
|
0 |
0 |
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
|
4645
|
0 |
0 |
for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) { |
|
4646
|
0 |
0 |
for (unsigned filter = 0; filter < tag_filters.size(); filter++) |
|
4647
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
|
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
|
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
|
4648
|
0 |
0 |
if (i == lemmas_new_size) { |
|
4651
|
0 |
0 |
lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial); |
|
4660
|
0 |
0 |
if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end()); |
|
4785
|
30 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
4 |
26 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
4845
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
4888
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
4892
|
0 |
0 |
unsigned tag_length = data.next_1B(); |
|
4893
|
0 |
0 |
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
|
|
0 |
0 |
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
|
4894
|
0 |
0 |
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
|
|
0 |
0 |
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
|
4895
|
0 |
0 |
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
|
|
0 |
0 |
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
|
4898
|
0 |
0 |
dictionary.load(data); |
|
4902
|
0 |
0 |
if (data.next_1B()) { |
|
|
0 |
0 |
if (data.next_1B()) { |
|
4903
|
0 |
0 |
prefix_guesser.reset(new morpho_prefix_guesser(dictionary)); |
|
4904
|
0 |
0 |
prefix_guesser->load(data); |
|
4909
|
0 |
0 |
if (data.next_1B()) { |
|
|
0 |
0 |
if (data.next_1B()) { |
|
4910
|
0 |
0 |
statistical_guesser.reset(new morpho_statistical_guesser()); |
|
4911
|
0 |
0 |
statistical_guesser->load(data); |
|
4912
|
0 |
0 |
} |
|
4923
|
0 |
0 |
if (form.len) { |
|
4927
|
0 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
|
4930
|
0 |
0 |
dictionary.analyze(form, lemmas); |
|
4931
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
4932
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
4933
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
|
4936
|
0 |
0 |
analyze_special(form, lemmas); |
|
4937
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
|
4940
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
4941
|
0 |
0 |
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
|
|
0 |
0 |
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
|
4945
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
4946
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
4947
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, nullptr); |
|
4949
|
0 |
0 |
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
|
4950
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, &used_rules); |
|
4951
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
4952
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
4958
|
0 |
0 |
if (prefix_guesser_guesses) { |
|
4961
|
0 |
0 |
return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); |
|
4964
|
0 |
0 |
return a.lemma == b.lemma && a.tag == b.tag; |
|
|
0 |
0 |
return a.lemma == b.lemma && a.tag == b.tag; |
|
4966
|
0 |
0 |
if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end()); |
|
4969
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
|
4972
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
|
4981
|
0 |
0 |
if (lemma.len) { |
|
4982
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
4985
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
5006
|
0 |
0 |
return new czech_tokenizer(language, version, this); |
|
5037
|
0 |
0 |
if (!form.len) return; |
|
5045
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
|
5046
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
|
5047
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
5048
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
|
5049
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
5051
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
|
5053
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
|
5056
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
5057
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), number_tag); |
|
5058
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
5059
|
0 |
0 |
((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first]))) |
|
|
0 |
0 |
((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first]))) |
|
5060
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
|
5098
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) { |
|
5099
|
0 |
0 |
if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+')) |
|
|
0 |
0 |
if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+')) |
|
5101
|
0 |
0 |
if (len + 1 < lemma.len && lemma.str[len] == '^') { |
|
|
0 |
0 |
if (len + 1 < lemma.len && lemma.str[len] == '^') { |
|
5103
|
0 |
0 |
for (unsigned i = len + 1; ok && i < lemma.len; i++) |
|
|
0 |
0 |
for (unsigned i = len + 1; ok && i < lemma.len; i++) |
|
5104
|
0 |
0 |
ok &= (lemma.str[i] >= 'A' && lemma.str[i] <= 'Z') || |
|
5105
|
0 |
0 |
(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') || |
|
|
0 |
0 |
(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') || |
|
5106
|
0 |
0 |
(i > len + 1 && lemma.str[i] == '-'); |
|
5107
|
0 |
0 |
if (ok) return len; |
|
5130
|
0 |
0 |
for (size_t i = len; i < lemma.len; i++) |
|
5137
|
0 |
0 |
if (data.empty()) return true; |
|
5138
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
5139
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
5140
|
0 |
0 |
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
|
|
0 |
0 |
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
|
5160
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
|
0 |
0 |
class english_morpho_guesser { |
|
5208
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
5282
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
5285
|
0 |
0 |
dictionary.load(data); |
|
5286
|
0 |
0 |
morpho_guesser.load(data); |
|
|
0 |
0 |
morpho_guesser.load(data); |
|
5297
|
0 |
0 |
if (form.len) { |
|
5301
|
0 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
|
5304
|
0 |
0 |
dictionary.analyze(form, lemmas); |
|
5305
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
5306
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
5307
|
0 |
0 |
if (!lemmas.empty()) |
|
5308
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
5311
|
0 |
0 |
analyze_special(form, lemmas); |
|
5312
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
|
5315
|
0 |
0 |
if (guesser == GUESSER) |
|
5316
|
0 |
0 |
morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas); |
|
|
0 |
0 |
morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas); |
|
5317
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
|
5320
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
|
5329
|
0 |
0 |
if (lemma.len) { |
|
5330
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
5350
|
0 |
0 |
return new english_tokenizer(version <= 2 ? 1 : 2); |
|
5357
|
0 |
0 |
if (!form.len) return; |
|
5360
|
0 |
0 |
if (form.len == 1) |
|
5364
|
0 |
0 |
case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return; |
|
5365
|
0 |
0 |
case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return; |
|
5366
|
0 |
0 |
case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return; |
|
5367
|
0 |
0 |
case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return; |
|
5368
|
0 |
0 |
case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
|
5369
|
0 |
0 |
case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
|
5370
|
0 |
0 |
case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag); |
|
5371
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nn_tag); return; |
|
5372
|
0 |
0 |
case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag); |
|
5373
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
|
5374
|
0 |
0 |
case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag); |
|
5375
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nn_tag); return; |
|
5376
|
0 |
0 |
case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag); |
|
5377
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), in_tag); return; |
|
5378
|
0 |
0 |
case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); |
|
5379
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), pos_tag); return; |
|
5386
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
|
5387
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
5388
|
0 |
0 |
while (codepoint == ',') { |
|
5390
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
|
5391
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
|
5392
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
|
5397
|
0 |
0 |
if (codepoint == '.' && number.len) { |
|
|
0 |
0 |
if (codepoint == '.' && number.len) { |
|
5399
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
5401
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
|
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
|
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
|
5402
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
|
5403
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len - 1), nns_tag); |
|
5406
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
5408
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
|
5410
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
5412
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
5413
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
|
5414
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nnp_tag); |
|
5415
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
5416
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), ls_tag); |
|
5423
|
0 |
0 |
while ((symbol || any_punctuation) && punctuation.len) { |
|
|
0 |
0 |
while ((symbol || any_punctuation) && punctuation.len) { |
|
5425
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
|
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
|
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
|
5426
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
|
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
|
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
|
5427
|
0 |
0 |
if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps; |
|
5428
|
0 |
0 |
if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe; |
|
5429
|
0 |
0 |
if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P; |
|
5430
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
|
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
|
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
|
5432
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
|
5433
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
|
5434
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
|
5435
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
|
5436
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
|
5437
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
|
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
|
5471
|
0 |
0 |
while (tags--) { |
|
5473
|
0 |
0 |
exceptions_tags.emplace_back(string(data.next(len), len)); |
|
5609
|
0 |
0 |
for (unsigned len = data.next_1B(); len; len--) { |
|
5615
|
0 |
0 |
if (exception) { |
|
5618
|
0 |
0 |
for (unsigned len = data.next_1B(); len; len--) { |
|
5621
|
0 |
0 |
for (unsigned tags = data.next_1B(); tags; tags--) |
|
5622
|
0 |
0 |
lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]); |
|
5629
|
0 |
0 |
for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) { |
|
5631
|
0 |
0 |
if (!found) break; |
|
5632
|
0 |
0 |
if (found[NEGATION_LEN]) { |
|
5633
|
0 |
0 |
if (form_lc.len - prefix >= found[TO_FOLLOW]) negation_len = found[NEGATION_LEN]; |
|
5639
|
0 |
0 |
add(JJ, lemma_lc, negation_len, lemmas); |
|
5640
|
0 |
0 |
add(RB, lemma_lc, negation_len, lemmas); |
|
5641
|
0 |
0 |
add(NN, lemma_lc, negation_len, lemmas); |
|
5642
|
0 |
0 |
add_NNS(lemma_lc, negation_len, lemmas); |
|
5659
|
0 |
0 |
if ( p == ( (form_lc.str + form_lc.len)) ) |
|
5666
|
0 |
0 |
if ( _klen > 0 ) { |
|
5671
|
0 |
0 |
if ( _upper < _lower ) |
|
5675
|
0 |
0 |
if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < *_mid ) |
|
5677
|
0 |
0 |
else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > *_mid ) |
|
5689
|
0 |
0 |
if ( _klen > 0 ) { |
|
5694
|
0 |
0 |
if ( _upper < _lower ) |
|
5698
|
0 |
0 |
if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < _mid[0] ) |
|
5700
|
0 |
0 |
else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > _mid[1] ) |
|
5714
|
0 |
0 |
if ( _tag_guesser_trans_actions[_trans] == 0 ) |
|
5719
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
5724
|
0 |
0 |
{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); } |
|
|
0 |
0 |
{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); } |
|
5727
|
0 |
0 |
{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); } |
|
|
0 |
0 |
{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); } |
|
5730
|
0 |
0 |
{ add_VBG(lemma_lc, lemmas); } |
|
5733
|
0 |
0 |
{ add_VBD_VBN(lemma_lc, lemmas); } |
|
5736
|
0 |
0 |
{ add_VBZ(lemma_lc, lemmas); } |
|
5742
|
0 |
0 |
{ if (!added_SYM) added_SYM = true, add(SYM, lemma_lc, lemmas); } |
|
5745
|
0 |
0 |
{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); } |
|
5751
|
0 |
0 |
if ( ++p != ( (form_lc.str + form_lc.len)) ) |
|
5754
|
0 |
0 |
if ( p == ( (form_lc.str + form_lc.len)) ) |
|
5758
|
0 |
0 |
while ( __nacts-- > 0 ) { |
|
5759
|
0 |
0 |
switch ( *__acts++ ) { |
|
5761
|
0 |
0 |
{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); } |
|
5777
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
5779
|
0 |
0 |
if (!is_NNP && !is_NNPS) return false; |
|
5782
|
0 |
0 |
for (auto&& lemma : lemmas) { |
|
5786
|
0 |
0 |
if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false; |
|
|
0 |
0 |
if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false; |
|
5789
|
0 |
0 |
if (is_NNP && !was_NNP) add(NNP, lemma, lemmas); |
|
5790
|
0 |
0 |
if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas); |
|
|
0 |
0 |
if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas); |
|
5795
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
5804
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
5906
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
5915
|
0 |
0 |
if ( _klen > 0 ) { |
|
5920
|
0 |
0 |
if ( _upper < _lower ) |
|
5924
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
|
5926
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
|
5938
|
0 |
0 |
if ( _klen > 0 ) { |
|
5943
|
0 |
0 |
if ( _upper < _lower ) |
|
5947
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
|
5949
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
|
5963
|
0 |
0 |
if ( _NNS_trans_actions[_trans] == 0 ) |
|
5968
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
5973
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = "an"; } |
|
5976
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 1, append = nullptr; } |
|
5979
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = "fe"; } |
|
5982
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
5985
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
5988
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
5991
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
|
5994
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
5997
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
|
6000
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
|
6003
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
|
6006
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 3, append = "y"; } |
|
6009
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
|
6012
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
|
6018
|
0 |
0 |
if ( cs == 0 ) |
|
6020
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
|
6026
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
6152
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
6161
|
0 |
0 |
if ( _klen > 0 ) { |
|
6166
|
0 |
0 |
if ( _upper < _lower ) |
|
6170
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
6172
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
6184
|
0 |
0 |
if ( _klen > 0 ) { |
|
6189
|
0 |
0 |
if ( _upper < _lower ) |
|
6193
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
6195
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
6209
|
0 |
0 |
if ( _NNPS_trans_actions[_trans] == 0 ) |
|
6214
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
6219
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = "AN"; } |
|
6222
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = "an"; } |
|
6225
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
|
6228
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = "FE"; } |
|
6231
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 3, append = "fe"; } |
|
6234
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
6237
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
|
6240
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
6243
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
|
6246
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 2, append = nullptr; } |
|
6249
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 1, append = nullptr; } |
|
6252
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
|
6255
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
|
6258
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 3, append = "Y"; } |
|
6261
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 3, append = "y"; } |
|
6264
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 2, append = nullptr; } |
|
6267
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 1, append = nullptr; } |
|
6273
|
0 |
0 |
if ( cs == 0 ) |
|
6275
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
|
6281
|
0 |
0 |
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0 |
0 |
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
6581
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
6590
|
0 |
0 |
if ( _klen > 0 ) { |
|
6595
|
0 |
0 |
if ( _upper < _lower ) |
|
6599
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
6601
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
6613
|
0 |
0 |
if ( _klen > 0 ) { |
|
6618
|
0 |
0 |
if ( _upper < _lower ) |
|
6622
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
6624
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
6638
|
0 |
0 |
if ( _VBG_trans_actions[_trans] == 0 ) |
|
6643
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
6648
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
|
6651
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 3, append = "e"; } |
|
6654
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
|
6657
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = "e"; } |
|
6660
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 3, append = nullptr; } |
|
6663
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
|
6666
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 3, append = nullptr; } |
|
6669
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 3, append = "e"; } |
|
6672
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 3, append = nullptr; } |
|
6675
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 3, append = "e"; } |
|
6678
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 3, append = nullptr; } |
|
6681
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 3, append = "e"; } |
|
6684
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 3, append = nullptr; } |
|
6687
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 3, append = "e"; } |
|
6690
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 3, append = nullptr; } |
|
6693
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
|
6696
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 3, append = nullptr; } |
|
6699
|
0 |
0 |
{ if (best > 'r') best = 'r', remove = 3, append = "e"; } |
|
6705
|
0 |
0 |
if ( cs == 0 ) |
|
6707
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
|
6710
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
6714
|
0 |
0 |
while ( __nacts-- > 0 ) { |
|
6717
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
|
6720
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
|
6723
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
|
6732
|
0 |
0 |
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0 |
0 |
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
7035
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
7044
|
0 |
0 |
if ( _klen > 0 ) { |
|
7049
|
0 |
0 |
if ( _upper < _lower ) |
|
7053
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
7055
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
7067
|
0 |
0 |
if ( _klen > 0 ) { |
|
7072
|
0 |
0 |
if ( _upper < _lower ) |
|
7076
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
7078
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
7092
|
0 |
0 |
if ( _VBD_VBN_trans_actions[_trans] == 0 ) |
|
7097
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
7102
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
|
7105
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
|
7108
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
|
7111
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7114
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
7117
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
7120
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
7123
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 3, append = "y"; } |
|
7126
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
|
7129
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
|
7132
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
|
7135
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
|
7138
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
|
7141
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 2, append = nullptr; } |
|
7144
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 1, append = nullptr; } |
|
7147
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 2, append = nullptr; } |
|
7150
|
0 |
0 |
{ if (best > 'r') best = 'r', remove = 1, append = nullptr; } |
|
7156
|
0 |
0 |
if ( cs == 0 ) |
|
7158
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7161
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
7165
|
0 |
0 |
while ( __nacts-- > 0 ) { |
|
7168
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7171
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
|
7174
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
|
7183
|
0 |
0 |
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0 |
0 |
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
7262
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
7271
|
0 |
0 |
if ( _klen > 0 ) { |
|
7276
|
0 |
0 |
if ( _upper < _lower ) |
|
7280
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
|
7282
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
|
7294
|
0 |
0 |
if ( _klen > 0 ) { |
|
7299
|
0 |
0 |
if ( _upper < _lower ) |
|
7303
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
|
7305
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
|
7319
|
0 |
0 |
if ( _VBZ_trans_actions[_trans] == 0 ) |
|
7324
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
7329
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
|
7332
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
|
7335
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
|
7338
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7341
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
7344
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
7347
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 3, append = "y"; } |
|
7350
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
|
7353
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
|
7359
|
0 |
0 |
if ( cs == 0 ) |
|
7361
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7367
|
0 |
0 |
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
|
0 |
0 |
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
7493
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
7502
|
0 |
0 |
if ( _klen > 0 ) { |
|
7507
|
0 |
0 |
if ( _upper < _lower ) |
|
7511
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
|
7513
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
|
7525
|
0 |
0 |
if ( _klen > 0 ) { |
|
7530
|
0 |
0 |
if ( _upper < _lower ) |
|
7534
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
|
7536
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
|
7550
|
0 |
0 |
if ( _JJR_RBR_trans_actions[_trans] == 0 ) |
|
7555
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
7560
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = nullptr; } |
|
7563
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 3, append = nullptr; } |
|
7566
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = "y"; } |
|
7569
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
|
7572
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
|
7575
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
|
7581
|
0 |
0 |
if ( cs == 0 ) |
|
7583
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7589
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
7719
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
|
7728
|
0 |
0 |
if ( _klen > 0 ) { |
|
7733
|
0 |
0 |
if ( _upper < _lower ) |
|
7737
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
|
7739
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
|
7751
|
0 |
0 |
if ( _klen > 0 ) { |
|
7756
|
0 |
0 |
if ( _upper < _lower ) |
|
7760
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
|
7762
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
|
7776
|
0 |
0 |
if ( _JJS_RBS_trans_actions[_trans] == 0 ) |
|
7781
|
0 |
0 |
while ( _nacts-- > 0 ) |
|
7786
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
|
7789
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 4, append = nullptr; } |
|
7792
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 4, append = "y"; } |
|
7795
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = nullptr; } |
|
7798
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 2, append = nullptr; } |
|
7801
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = nullptr; } |
|
7807
|
0 |
0 |
if ( cs == 0 ) |
|
7809
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
|
7815
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
7898
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
7902
|
0 |
0 |
unsigned length = data.next_1B(); |
|
7903
|
0 |
0 |
unknown_tag.assign(data.next(length), length); |
|
|
0 |
0 |
unknown_tag.assign(data.next(length), length); |
|
7914
|
0 |
0 |
if (form.len) { |
|
7917
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
7918
|
0 |
0 |
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
|
7921
|
0 |
0 |
while (lemmatags.len) { |
|
7923
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
7924
|
0 |
0 |
if (!lemmatags.len) break; |
|
7929
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
7931
|
0 |
0 |
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
|
7933
|
0 |
0 |
lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len)); |
|
7936
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
|
7939
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
|
7948
|
0 |
0 |
if (lemma.len) { |
|
7951
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
7953
|
0 |
0 |
if (formtags.len) formtags.len--, formtags.str++; |
|
7957
|
0 |
0 |
while (formtags.len) { |
|
7959
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
7960
|
0 |
0 |
if (!formtags.len) break; |
|
7965
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
7967
|
0 |
0 |
if (formtags.len) formtags.len--, formtags.str++; |
|
7971
|
0 |
0 |
if (filter.matches(tag.c_str())) { |
|
7972
|
0 |
0 |
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
|
|
0 |
0 |
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
|
7973
|
0 |
0 |
forms.back().forms.emplace_back(string(form_start, form_len), tag); |
|
7977
|
0 |
0 |
if (any_result) return NO_GUESSER; |
|
7985
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
7991
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
7997
|
0 |
0 |
while (form_len < form.len && form.str[form_len] != ' ') form_len++; |
|
|
0 |
0 |
while (form_len < form.len && form.str[form_len] != ' ') form_len++; |
|
8118
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
8122
|
1 |
0 |
unsigned length = data.next_1B(); |
|
8123
|
1 |
0 |
unknown_tag.assign(data.next(length), length); |
|
8124
|
1 |
0 |
length = data.next_1B(); |
|
8125
|
1 |
0 |
number_tag.assign(data.next(length), length); |
|
8126
|
1 |
0 |
length = data.next_1B(); |
|
8127
|
1 |
0 |
punctuation_tag.assign(data.next(length), length); |
|
8128
|
1 |
0 |
length = data.next_1B(); |
|
8129
|
1 |
0 |
symbol_tag.assign(data.next(length), length); |
|
8132
|
1 |
0 |
dictionary.load(data); |
|
8136
|
1 |
0 |
if (data.next_1B()) { |
|
|
1 |
0 |
if (data.next_1B()) { |
|
8137
|
1 |
0 |
statistical_guesser.reset(new morpho_statistical_guesser()); |
|
8138
|
1 |
0 |
statistical_guesser->load(data); |
|
8139
|
0 |
0 |
} |
|
8150
|
7 |
0 |
if (form.len) { |
|
8154
|
7 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
|
8157
|
7 |
0 |
dictionary.analyze(form, lemmas); |
|
8158
|
0 |
7 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
8159
|
1 |
6 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
|
1 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
8160
|
0 |
7 |
if (!lemmas.empty()) return NO_GUESSER; |
|
8163
|
0 |
0 |
analyze_special(form, lemmas); |
|
8164
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
|
8167
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
8168
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
8169
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, nullptr); |
|
8171
|
0 |
0 |
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
|
8172
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, &used_rules); |
|
8173
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
8174
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
8177
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
|
8180
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
|
8189
|
0 |
0 |
if (lemma.len) { |
|
8190
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
8220
|
0 |
0 |
if (!form.len) return; |
|
8228
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
|
8229
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
8230
|
0 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
|
0 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
|
0 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
8231
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
8232
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
8234
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
|
8236
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
|
8239
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
8240
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
|
8247
|
0 |
0 |
while (form.len) { |
|
8249
|
0 |
0 |
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
|
|
0 |
0 |
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
|
8250
|
0 |
0 |
symbol = symbol && unicode::category(codepoint) & unicode::S; |
|
|
0 |
0 |
symbol = symbol && unicode::category(codepoint) & unicode::S; |
|
8252
|
0 |
0 |
if (punctuation) |
|
8253
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
|
8254
|
0 |
0 |
else if (symbol) |
|
8255
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag); |
|
8302
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
8308
|
0 |
0 |
std::map enlarged_map(map.begin(), map.end()); |
|
|
0 |
0 |
std::map enlarged_map(map.begin(), map.end()); |
|
8310
|
0 |
0 |
for (auto&& entry : map) { |
|
|
0 |
0 |
for (auto&& entry : map) { |
|
8313
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
8314
|
0 |
0 |
for (unsigned i = key.size() - 1; i; i--) |
|
|
0 |
0 |
for (unsigned i = key.size() - 1; i; i--) |
|
8315
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
|
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
|
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
|
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
|
8317
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
8318
|
0 |
0 |
for (unsigned i = 1; i < key.size(); i++) |
|
|
0 |
0 |
for (unsigned i = 1; i < key.size(); i++) |
|
8319
|
0 |
0 |
enlarged_map[key.substr(i)]; |
|
|
0 |
0 |
enlarged_map[key.substr(i)]; |
|
|
0 |
0 |
enlarged_map[key.substr(i)]; |
|
|
0 |
0 |
enlarged_map[key.substr(i)]; |
|
8322
|
0 |
0 |
construct(enlarged_map, load_factor, entry_encode); |
|
|
0 |
0 |
construct(enlarged_map, load_factor, entry_encode); |
|
8333
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
8335
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
8338
|
0 |
0 |
for (auto&& size : sizes) |
|
|
0 |
0 |
for (auto&& size : sizes) |
|
|
0 |
0 |
for (auto&& size : sizes) |
|
|
0 |
0 |
for (auto&& size : sizes) |
|
8339
|
0 |
0 |
resize(unsigned(load_factor * size)); |
|
|
0 |
0 |
resize(unsigned(load_factor * size)); |
|
|
0 |
0 |
resize(unsigned(load_factor * size)); |
|
|
0 |
0 |
resize(unsigned(load_factor * size)); |
|
8342
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
8343
|
0 |
0 |
binary_encoder enc; |
|
|
0 |
0 |
binary_encoder enc; |
|
|
0 |
0 |
binary_encoder enc; |
|
|
0 |
0 |
binary_encoder enc; |
|
8344
|
0 |
0 |
entry_encode(enc, elem.second); |
|
|
0 |
0 |
entry_encode(enc, elem.second); |
|
|
0 |
0 |
entry_encode(enc, elem.second); |
|
|
0 |
0 |
entry_encode(enc, elem.second); |
|
8347
|
0 |
0 |
done_adding(); |
|
|
0 |
0 |
done_adding(); |
|
|
0 |
0 |
done_adding(); |
|
|
0 |
0 |
done_adding(); |
|
8350
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
|
0 |
0 |
for (auto&& elem : map) { |
|
8351
|
0 |
0 |
binary_encoder enc; |
|
|
0 |
0 |
binary_encoder enc; |
|
|
0 |
0 |
binary_encoder enc; |
|
|
0 |
0 |
binary_encoder enc; |
|
8352
|
0 |
0 |
entry_encode(enc, elem.second); |
|
|
0 |
0 |
entry_encode(enc, elem.second); |
|
|
0 |
0 |
entry_encode(enc, elem.second); |
|
|
0 |
0 |
entry_encode(enc, elem.second); |
|
8361
|
0 |
0 |
for (auto&& hash : hashes) |
|
8420
|
1 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
|
|
0 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
|
|
0 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
|
8460
|
0 |
0 |
if (!*str) return; |
|
8462
|
0 |
0 |
for (auto&& child : children) |
|
8463
|
0 |
0 |
if (child.first == *str) { |
|
8468
|
0 |
0 |
children.emplace_back(*str, new_unique_ptr()); |
|
8476
|
0 |
0 |
find_candidate_prefix(max_suffix_len, current, best, best_length, 0); |
|
8480
|
0 |
0 |
if (depth < max_suffix_len && length > best_length) { |
|
|
0 |
0 |
if (depth < max_suffix_len && length > best_length) { |
|
8484
|
0 |
0 |
for (auto&& child : children) { |
|
8486
|
0 |
0 |
child.second->find_candidate_prefix(max_suffix_len, current, best, best_length, children.size() == 1 ? length + 1 : 1); |
|
8498
|
0 |
0 |
if (str.size() >= lengths.size()) lengths.resize(str.size() + 1); |
|
8504
|
0 |
0 |
for (auto&& set : lengths) |
|
8513
|
0 |
0 |
this->lemma = lemma.substr(0, addinfo.parse(lemma, true)); |
|
8524
|
0 |
0 |
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
|
|
0 |
0 |
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
|
|
0 |
0 |
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
|
8528
|
0 |
0 |
bool operator<(const lemma_info& other) const { return lemma < other.lemma || (lemma == other.lemma && addinfo.data < other.addinfo.data); } |
|
8547
|
0 |
0 |
dict.load(is, max_suffix_len); |
|
8550
|
0 |
0 |
dict.encode(enc); |
|
8559
|
0 |
0 |
while(raw.next_lemma(lemma, forms)) { |
|
|
0 |
0 |
while(raw.next_lemma(lemma, forms)) { |
|
8563
|
0 |
0 |
if (forms_end != forms.end()) { |
|
8569
|
0 |
0 |
lemmas.emplace_back(lemma); |
|
8571
|
0 |
0 |
lemmas_hist.add(lemma_info.lemma); |
|
8574
|
0 |
0 |
while (!forms.empty()) { |
|
8576
|
0 |
0 |
for (auto&& form : forms) |
|
8577
|
0 |
0 |
t.add(form.first.c_str()); |
|
8580
|
0 |
0 |
string prefix = t.find_candidate_prefix(max_suffix_len); |
|
8584
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
8585
|
0 |
0 |
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
|
|
0 |
0 |
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
|
|
0 |
0 |
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
|
8587
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
8591
|
0 |
0 |
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
|
|
0 |
0 |
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
|
|
0 |
0 |
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
|
8594
|
0 |
0 |
for (auto form = start; form != end; form++) { |
|
8595
|
0 |
0 |
if (!clas.empty()) clas.push_back('\t'); |
|
|
0 |
0 |
if (!clas.empty()) clas.push_back('\t'); |
|
8596
|
0 |
0 |
clas.append(form->first, common_prefix, string::npos); |
|
8597
|
0 |
0 |
clas.push_back('\t'); |
|
8603
|
0 |
0 |
if (class_it.second) { |
|
8605
|
0 |
0 |
for (auto form = start; form != end; form++) { |
|
8607
|
0 |
0 |
if (tag >= int(tags.size())) tags.emplace_back(form->second); |
|
|
0 |
0 |
if (tag >= int(tags.size())) tags.emplace_back(form->second); |
|
8608
|
0 |
0 |
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
|
|
0 |
0 |
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
|
|
0 |
0 |
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
|
8613
|
0 |
0 |
lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id); |
|
|
0 |
0 |
lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id); |
|
8614
|
0 |
0 |
forms_hist.add(lemma_info.forms.back().form); |
|
8630
|
0 |
0 |
for (auto&& lemma : lemmas) { |
|
8632
|
0 |
0 |
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
|
|
0 |
0 |
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
|
|
0 |
0 |
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
|
8634
|
0 |
0 |
enc.add_1B(prev.length() - cpl); |
|
8635
|
0 |
0 |
enc.add_1B(lemma.lemma.size() - cpl); |
|
8636
|
0 |
0 |
enc.add_data(lemma.lemma.substr(cpl)); |
|
8637
|
0 |
0 |
enc.add_1B(lemma.addinfo.data.size()); |
|
8639
|
0 |
0 |
enc.add_1B(lemma.forms.size()); |
|
8642
|
0 |
0 |
for (auto&& lemma_form : lemma.forms) { |
|
8644
|
0 |
0 |
for (unsigned prev_from = 0; prev_from < prev_form.size(); prev_from++) |
|
8645
|
0 |
0 |
for (unsigned form_from = 0; form_from < lemma_form.form.size(); form_from++) { |
|
8647
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
8648
|
0 |
0 |
if (len > best_len) best_prev_from = prev_from, best_form_from = form_from, best_len = len; |
|
8652
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
8654
|
0 |
0 |
if (best_prev_from > 0) enc.add_1B(best_prev_from); |
|
|
0 |
0 |
if (best_prev_from > 0) enc.add_1B(best_prev_from); |
|
8655
|
0 |
0 |
if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len); |
|
|
0 |
0 |
if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len); |
|
8656
|
0 |
0 |
if (best_form_from > 0) { |
|
8657
|
0 |
0 |
enc.add_1B(best_form_from); |
|
8658
|
0 |
0 |
enc.add_data(lemma_form.form.substr(0, best_form_from)); |
|
8660
|
0 |
0 |
if (best_form_from + best_len < lemma_form.form.size()) { |
|
8661
|
0 |
0 |
enc.add_1B(lemma_form.form.size() - best_form_from - best_len); |
|
8662
|
0 |
0 |
enc.add_data(lemma_form.form.substr(best_form_from + best_len)); |
|
8664
|
0 |
0 |
enc.add_2B(lemma_form.clas); |
|
8673
|
0 |
0 |
enc.add_2B(tags.size()); |
|
8674
|
0 |
0 |
for (auto&& tag : tags) { |
|
8675
|
0 |
0 |
enc.add_1B(tag.size()); |
|
8680
|
0 |
0 |
persistent_unordered_map(suffixes, 5, false, true, [](binary_encoder& enc, const map>& suffix) { |
|
8682
|
0 |
0 |
for (auto&& clas : suffix) |
|
8685
|
0 |
0 |
for (auto&& clas : suffix) { |
|
8686
|
0 |
0 |
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
|
8690
|
0 |
0 |
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
|
8691
|
0 |
0 |
for (auto&& clas : suffix) |
|
8692
|
0 |
0 |
for (auto&& tag : clas.second) |
|
8694
|
0 |
0 |
}).save(enc); |
|
8761
|
0 |
0 |
enc.add_1B(tags.unknown_tag.size()); |
|
8763
|
0 |
0 |
enc.add_1B(tags.number_tag.size()); |
|
8765
|
0 |
0 |
enc.add_1B(tags.punctuation_tag.size()); |
|
8767
|
0 |
0 |
enc.add_1B(tags.symbol_tag.size()); |
|
8771
|
0 |
0 |
morpho_dictionary_encoder::encode(in_dictionary, max_suffix_len, enc); |
|
8774
|
0 |
0 |
enc.add_1B(bool(in_statistical_guesser)); |
|
8775
|
0 |
0 |
if (in_statistical_guesser) { |
|
8777
|
0 |
0 |
morpho_statistical_guesser_encoder::encode(in_statistical_guesser, enc); |
|
8782
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
8851
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
8860
|
0 |
0 |
3); |
|
|
0 |
0 |
3); |
|
8861
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
8867
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
8873
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
8879
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
8885
|
0 |
0 |
if (!derinet->load(is)) return nullptr; |
|
|
0 |
0 |
if (!derinet->load(is)) return nullptr; |
|
8887
|
0 |
0 |
unique_ptr dictionary(load(is)); |
|
8888
|
0 |
0 |
if (!dictionary) return nullptr; |
|
8899
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
|
8900
|
0 |
0 |
if (!f) return nullptr; |
|
8902
|
0 |
0 |
return load(f); |
|
8929
|
6 |
1 |
for (auto&& tag : tags) { |
|
8931
|
397 |
6 |
for (unsigned i = 0; i < tag.size(); i++) |
|
8942
|
0 |
0 |
if (!used) return false; |
|
8944
|
0 |
0 |
for (auto&& used_rule : *used) |
|
8945
|
0 |
0 |
if (used_rule == rule) |
|
8957
|
0 |
0 |
string rule_label; rule_label.reserve(12); |
|
8959
|
0 |
0 |
for (; suffix_len < form.len; suffix_len++) { |
|
8960
|
0 |
0 |
rule_label.push_back(form.str[form.len - (suffix_len + 1)]); |
|
8961
|
0 |
0 |
if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); })) |
|
8965
|
0 |
0 |
for (suffix_len++; suffix_len--; ) { |
|
8967
|
0 |
0 |
rule_label.push_back(' '); |
|
8971
|
0 |
0 |
for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) { |
|
8972
|
0 |
0 |
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
|
|
0 |
0 |
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
|
8974
|
0 |
0 |
if (!found) break; |
|
8975
|
0 |
0 |
if (*(found += sizeof(uint16_t))) { |
|
8981
|
0 |
0 |
if (rule) { |
|
8983
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
8984
|
0 |
0 |
if (used) used->push_back(rule_label); |
|
|
0 |
0 |
if (used) used->push_back(rule_label); |
|
8985
|
0 |
0 |
for (int rules_len = *rule++; rules_len; rules_len--) { |
|
8992
|
0 |
0 |
if (pref_del_len + suff_del_len > form.len || |
|
|
0 |
0 |
if (pref_del_len + suff_del_len > form.len || |
|
8993
|
0 |
0 |
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
|
|
0 |
0 |
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
|
8994
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
8999
|
0 |
0 |
lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len); |
|
9000
|
0 |
0 |
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
|
|
0 |
0 |
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
|
9001
|
0 |
0 |
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
|
|
0 |
0 |
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
|
9002
|
0 |
0 |
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
|
|
0 |
0 |
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
|
9003
|
0 |
0 |
while (tags_len--) |
|
9004
|
0 |
0 |
lemmas.emplace_back(lemma, this->tags[unaligned_load_inc(tags)]); |
|
9012
|
0 |
0 |
if (lemmas.size() == lemmas_initial_size) |
|
9013
|
0 |
0 |
if (!contains(used, string())) { |
|
9014
|
0 |
0 |
if (used) used->push_back(string()); |
|
9015
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), tags[default_tag]); |
|
9050
|
0 |
0 |
if (text.empty()) return; |
|
9053
|
0 |
0 |
for (string::size_type next; (next = text.find(sep, index)) != string::npos; index = next + 1) |
|
9061
|
53 |
0 |
if (!text.len) return; |
|
9064
|
68 |
53 |
for (const char* next; (next = (const char*) memchr(str, sep, text.str + text.len - str)); str = next + 1) |
|
9095
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
9097
|
0 |
0 |
if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data()); |
|
|
0 |
0 |
if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data()); |
|
9099
|
0 |
0 |
while (getline(is, line)) { |
|
|
0 |
0 |
while (getline(is, line)) { |
|
9100
|
0 |
0 |
split(line, '\t', tokens); |
|
9101
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
9104
|
0 |
0 |
split(tokens[0], ' ', affixes); |
|
9105
|
0 |
0 |
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
|
|
0 |
0 |
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
|
|
0 |
0 |
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
|
9108
|
0 |
0 |
auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]]; |
|
|
0 |
0 |
auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]]; |
|
9109
|
0 |
0 |
for (unsigned i = 1; i < tokens.size(); i+= 2) { |
|
9111
|
0 |
0 |
split(tokens[i], ' ', replacements); |
|
9112
|
0 |
0 |
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
|
|
0 |
0 |
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
|
|
0 |
0 |
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
|
9115
|
0 |
0 |
split(tokens[i+1], ' ', rule_tags); |
|
9117
|
0 |
0 |
for (auto&& rule_tag : rule_tags) { |
|
9119
|
0 |
0 |
if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag); |
|
|
0 |
0 |
if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag); |
|
9120
|
0 |
0 |
decoded_tags.emplace_back(tag); |
|
9123
|
0 |
0 |
rules.emplace_back(replacements, decoded_tags); |
|
9128
|
0 |
0 |
enc.add_2B(tags.size()); |
|
9129
|
0 |
0 |
for (auto&& tag : tags) { |
|
9130
|
0 |
0 |
enc.add_1B(tag.size()); |
|
9133
|
0 |
0 |
enc.add_2B(statistical_guesser_default); |
|
9137
|
0 |
0 |
e.add_1B(rules.size()); |
|
9138
|
0 |
0 |
for (auto&& rule : rules) { |
|
9139
|
0 |
0 |
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
|
|
0 |
0 |
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
|
|
0 |
0 |
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
|
9140
|
0 |
0 |
for (auto&& affix : rule.first) { |
|
9141
|
0 |
0 |
e.add_1B(affix.size()); |
|
9144
|
0 |
0 |
e.add_1B(rule.second.size()); |
|
9145
|
0 |
0 |
for (auto&& tag : rule.second) |
|
9146
|
0 |
0 |
e.add_2B(tag); |
|
9148
|
0 |
0 |
enc.add_2B(e.data.size()); |
|
9150
|
0 |
0 |
}).save(enc); |
|
|
0 |
0 |
}).save(enc); |
|
9211
|
0 |
0 |
for (string line; getline(is, line);) { |
|
|
0 |
0 |
for (string line; getline(is, line);) { |
|
9212
|
0 |
0 |
if (line.empty()) continue; |
|
9214
|
0 |
0 |
split(line, '\t', tokens); |
|
9215
|
0 |
0 |
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
|
9216
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
9221
|
0 |
0 |
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
|
|
0 |
0 |
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
|
|
0 |
0 |
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
|
9223
|
0 |
0 |
set_casing(tokens[0], lemma_case, form); |
|
9228
|
0 |
0 |
data.emplace_back(form, tokens[1], tokens[2]); |
|
9233
|
0 |
0 |
for (auto&& instance : data) |
|
9234
|
0 |
0 |
if (!instance.form_prefix.empty()) |
|
9238
|
0 |
0 |
for (auto&& prefix : prefixes_with_forms) |
|
9239
|
0 |
0 |
if (prefix.second.size() >= min_prefix_count) |
|
9240
|
0 |
0 |
prefixes_with_counts.emplace_back(unsigned(prefix.second.size()), prefix.first); |
|
9242
|
0 |
0 |
if (prefixes_with_counts.size() > max_prefixes) { |
|
9244
|
0 |
0 |
prefixes_with_counts.resize(max_prefixes); |
|
9249
|
0 |
0 |
for (auto&& prefix : prefixes_with_counts) |
|
9257
|
0 |
0 |
for (auto&& instance : data) { |
|
9263
|
0 |
0 |
for (auto&& prefix : prefixes) |
|
9264
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
9267
|
0 |
0 |
tag_lemma_rule.assign(instance.lemma_rule).append("\t").append(instance.tag); |
|
9270
|
0 |
0 |
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
|
|
0 |
0 |
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
|
|
0 |
0 |
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
|
9271
|
0 |
0 |
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
|
0 |
0 |
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
|
0 |
0 |
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
9279
|
0 |
0 |
for (auto&& tag : tags) |
|
9280
|
0 |
0 |
if (tag.second.size() > most_frequent_tag_count) |
|
9289
|
0 |
0 |
for (auto&& suffix : suffixes) { |
|
9290
|
0 |
0 |
for (auto&& prefix : prefixes) { |
|
9295
|
0 |
0 |
for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) { |
|
|
0 |
0 |
for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) { |
|
9296
|
0 |
0 |
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
|
|
0 |
0 |
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
|
|
0 |
0 |
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
|
9297
|
0 |
0 |
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
|
|
0 |
0 |
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
|
|
0 |
0 |
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
|
9298
|
0 |
0 |
if (!rules.count(rule_key)) continue; |
|
9301
|
0 |
0 |
for (auto&& entry : rules[rule_key]) |
|
9302
|
0 |
0 |
if (!rules_set.count(entry.first)) { |
|
9303
|
0 |
0 |
rules_counts.emplace_back(unsigned(entry.second.size()), entry.first); |
|
9309
|
0 |
0 |
if (rules_counts.size() >= rules_per_suffix) { |
|
9310
|
0 |
0 |
rules_counts.resize(rules_per_suffix); |
|
9315
|
0 |
0 |
if (rules_set.empty()) break; |
|
9317
|
0 |
0 |
if (!rules_set.empty()) { |
|
9319
|
0 |
0 |
output.assign(prefix).append(" ").append(suffix); |
|
9320
|
0 |
0 |
for (unsigned i = 0; i < rules_counts.size(); i++) { |
|
9323
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
9326
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
9327
|
0 |
0 |
output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos); |
|
|
0 |
0 |
output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos); |
|
9342
|
0 |
0 |
for (int offset = -int(lemma.size() - 1); offset < int(form.size()) - 1; offset++) { |
|
9345
|
0 |
0 |
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
|
|
0 |
0 |
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
|
|
0 |
0 |
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
|
9346
|
0 |
0 |
if (form[form_offset] == lemma[lemma_offset]) { |
|
9347
|
0 |
0 |
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
|
|
0 |
0 |
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
|
|
0 |
0 |
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
|
9354
|
0 |
0 |
form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0); |
|
|
0 |
0 |
form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0); |
|
9355
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
9356
|
0 |
0 |
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
|
|
0 |
0 |
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
|
|
0 |
0 |
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
|
9364
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) { |
|
9368
|
0 |
0 |
if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue; |
|
|
0 |
0 |
if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue; |
|
9369
|
0 |
0 |
if (cat & ~unicode::L) return CASE_OTHER; |
|
9371
|
0 |
0 |
if (index == 0) { |
|
9372
|
0 |
0 |
c = cat & unicode::Ll ? CASE_LC : CASE_UC; |
|
9373
|
0 |
0 |
} else if (c == CASE_UC && index == 1) { |
|
9374
|
0 |
0 |
c = cat & unicode::Ll ? CASE_UCLC : CASE_UC; |
|
9375
|
0 |
0 |
} else if (c == CASE_UC) { |
|
9376
|
0 |
0 |
if (cat & ~unicode::Lut) return CASE_OTHER; |
|
9378
|
0 |
0 |
if (cat & ~unicode::Ll) return CASE_OTHER; |
|
9390
|
0 |
0 |
for (auto&& chr : utf8::decoder(original)) { |
|
9391
|
0 |
0 |
utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr)); |
|
|
0 |
0 |
utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr)); |
|
9400
|
0 |
0 |
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
|
|
0 |
0 |
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
|
|
0 |
0 |
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
|
9403
|
0 |
0 |
if (additional + length > word.size()) return false; |
|
9427
|
0 |
0 |
if (line.empty()) { |
|
9428
|
0 |
0 |
if (!getline(in, line)) |
|
9431
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
9435
|
0 |
0 |
if (seen_lemmas.count(lemma)) |
|
9436
|
0 |
0 |
training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!"); |
|
|
0 |
0 |
training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!"); |
|
9441
|
0 |
0 |
while (getline(in, line)) { |
|
9443
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
9445
|
0 |
0 |
if (lemma != tokens[0]) break; |
|
9470
|
0 |
0 |
if (!filter) return; |
|
9472
|
0 |
0 |
wildcard.assign(filter); |
|
9475
|
0 |
0 |
for (int tag_pos = 0, filter_pos = 0; filter[filter_pos]; tag_pos++, filter_pos++) { |
|
9476
|
0 |
0 |
if (filter[filter_pos] == '?') continue; |
|
9477
|
0 |
0 |
if (filter[filter_pos] == '[') { |
|
9481
|
0 |
0 |
if (filter[filter_pos] == '^') negate = true, filter_pos++; |
|
9484
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
9487
|
0 |
0 |
filters.emplace_back(tag_pos, negate, chars_start, filter_pos - chars_start); |
|
9488
|
0 |
0 |
if (!filter[filter_pos]) break; |
|
9490
|
0 |
0 |
filters.emplace_back(tag_pos, false, filter_pos, 1); |
|
9543
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
10 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
8 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
10 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
5 |
14 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
7 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
9551
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
9554
|
1 |
0 |
maps.resize(data.next_1B()); |
|
|
1 |
0 |
maps.resize(data.next_1B()); |
|
9555
|
27 |
1 |
for (auto&& map : maps) |
|
9556
|
27 |
0 |
map.load(data); |
|
|
0 |
0 |
map.load(data); |
|
9598
|
1171 |
0 |
if (value < 0x80) *where++ = value; |
|
9599
|
0 |
0 |
else if (value < 0x4000) *where++ = (value >> 7) | 0x80u, *where++ = value & 0x7Fu; |
|
9600
|
0 |
0 |
else if (value < 0x200000) *where++ = (value >> 14) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
|
9601
|
0 |
0 |
else if (value < 0x10000000) *where++ = (value >> 21) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
|
9608
|
0 |
0 |
while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u); |
|
|
0 |
0 |
while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u); |
|
9643
|
0 |
0 |
struct feature_sequence { |
|
|
0 |
0 |
struct feature_sequence { |
|
9649
|
0 |
0 |
class feature_sequences { |
|
|
0 |
0 |
class feature_sequences { |
|
|
1 |
0 |
class feature_sequences { |
|
|
0 |
0 |
class feature_sequences { |
|
|
0 |
0 |
class feature_sequences { |
|
9678
|
0 |
0 |
return it ? unaligned_load(it) : 0; |
|
|
0 |
0 |
return it ? unaligned_load(it) : 0; |
|
|
270 |
76 |
return it ? unaligned_load(it) : 0; |
|
9687
|
1 |
0 |
if (!elementary.load(is)) return false; |
|
|
0 |
0 |
if (!elementary.load(is)) return false; |
|
|
0 |
0 |
if (!elementary.load(is)) return false; |
|
9690
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
9693
|
1 |
0 |
sequences.resize(data.next_1B()); |
|
|
1 |
0 |
sequences.resize(data.next_1B()); |
|
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
9694
|
74 |
1 |
for (auto&& sequence : sequences) { |
|
|
0 |
0 |
for (auto&& sequence : sequences) { |
|
|
0 |
0 |
for (auto&& sequence : sequences) { |
|
9695
|
74 |
0 |
sequence.dependant_range = data.next_4B(); |
|
|
0 |
0 |
sequence.dependant_range = data.next_4B(); |
|
|
0 |
0 |
sequence.dependant_range = data.next_4B(); |
|
9696
|
74 |
0 |
sequence.elements.resize(data.next_1B()); |
|
|
74 |
0 |
sequence.elements.resize(data.next_1B()); |
|
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
9697
|
154 |
74 |
for (auto&& element : sequence.elements) { |
|
|
0 |
0 |
for (auto&& element : sequence.elements) { |
|
|
0 |
0 |
for (auto&& element : sequence.elements) { |
|
9698
|
154 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
|
|
0 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
|
|
0 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
|
9699
|
154 |
0 |
element.elementary_index = data.next_4B(); |
|
|
0 |
0 |
element.elementary_index = data.next_4B(); |
|
|
0 |
0 |
element.elementary_index = data.next_4B(); |
|
9700
|
154 |
0 |
element.sequence_index = data.next_4B(); |
|
|
0 |
0 |
element.sequence_index = data.next_4B(); |
|
|
0 |
0 |
element.sequence_index = data.next_4B(); |
|
9704
|
1 |
0 |
scores.resize(data.next_1B()); |
|
|
1 |
0 |
scores.resize(data.next_1B()); |
|
|
0 |
0 |
scores.resize(data.next_1B()); |
|
|
0 |
0 |
scores.resize(data.next_1B()); |
|
|
0 |
0 |
scores.resize(data.next_1B()); |
|
|
0 |
0 |
scores.resize(data.next_1B()); |
|
9705
|
74 |
1 |
for (auto&& score : scores) |
|
|
0 |
0 |
for (auto&& score : scores) |
|
|
0 |
0 |
for (auto&& score : scores) |
|
9706
|
74 |
0 |
score.load(data); |
|
|
0 |
0 |
score.load(data); |
|
|
0 |
0 |
score.load(data); |
|
|
0 |
0 |
score.load(data); |
|
|
0 |
0 |
score.load(data); |
|
|
0 |
0 |
score.load(data); |
|
9726
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
9734
|
0 |
0 |
caches.reserve(self.sequences.size()); |
|
|
0 |
0 |
caches.reserve(self.sequences.size()); |
|
|
1 |
0 |
caches.reserve(self.sequences.size()); |
|
|
0 |
0 |
caches.reserve(self.sequences.size()); |
|
9736
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
|
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
|
|
74 |
1 |
for (auto&& sequence : self.sequences) { |
|
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
|
9737
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
|
74 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
9738
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
|
2 |
72 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
9739
|
0 |
0 |
for (auto&& element : sequence.elements) |
|
|
0 |
0 |
for (auto&& element : sequence.elements) |
|
|
154 |
74 |
for (auto&& element : sequence.elements) |
|
|
0 |
0 |
for (auto&& element : sequence.elements) |
|
9740
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
92 |
62 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
2 |
90 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
9743
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
|
1 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
9744
|
0 |
0 |
window.resize(max_window_size); |
|
|
0 |
0 |
window.resize(max_window_size); |
|
|
1 |
0 |
window.resize(max_window_size); |
|
|
0 |
0 |
window.resize(max_window_size); |
|
9755
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
|
1 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
9756
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
|
1 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
9757
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
9758
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
|
7 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
9766
|
0 |
0 |
for (auto&& cache : c.caches) |
|
|
0 |
0 |
for (auto&& cache : c.caches) |
|
|
74 |
1 |
for (auto&& cache : c.caches) |
|
|
0 |
0 |
for (auto&& cache : c.caches) |
|
9772
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
|
12 |
3 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
9778
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
36 |
7 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
6 |
30 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
30 |
13 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
9783
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
|
658 |
8 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
9784
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
|
653 |
5 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
9788
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
|
1345 |
479 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
9794
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
458 |
17 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
452 |
6 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
9797
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
|
778 |
66 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
9804
|
0 |
0 |
if (value == elementary_feature_unknown) { |
|
|
0 |
0 |
if (value == elementary_feature_unknown) { |
|
|
174 |
1171 |
if (value == elementary_feature_unknown) { |
|
|
0 |
0 |
if (value == elementary_feature_unknown) { |
|
9813
|
0 |
0 |
if (!key_size) { |
|
|
0 |
0 |
if (!key_size) { |
|
|
174 |
479 |
if (!key_size) { |
|
|
0 |
0 |
if (!key_size) { |
|
9816
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
355 |
124 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
222 |
133 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
346 |
133 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
9833
|
0 |
0 |
for (unsigned i = 0; i < c.caches.size(); i++) |
|
9877
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
|
1 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
9890
|
0 |
0 |
if (!forms.size()) return; |
|
|
0 |
0 |
if (!forms.size()) return; |
|
|
1 |
0 |
if (!forms.size()) return; |
|
|
0 |
0 |
if (!forms.size()) return; |
|
9894
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
|
7 |
1 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
9895
|
0 |
0 |
if (analyses[i].empty()) return; |
|
|
0 |
0 |
if (analyses[i].empty()) return; |
|
|
7 |
0 |
if (analyses[i].empty()) return; |
|
|
0 |
0 |
if (analyses[i].empty()) return; |
|
9896
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
|
5 |
2 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
9899
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
|
1 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
9905
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0 |
1 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
9911
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
9914
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
|
7 |
21 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
9915
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
|
10 |
7 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
9916
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
|
15 |
10 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
9920
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
27 |
9 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
21 |
6 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
9921
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
7 |
14 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
5 |
2 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
9926
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
|
12 |
3 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
9927
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
7 |
8 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
5 |
2 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
12 |
3 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
9931
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
|
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
|
|
2 |
13 |
if (same_tags >= decoding_order-1) { |
|
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
|
9932
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
|
1 |
1 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
9947
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
|
1 |
1 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
9948
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
|
1 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
9951
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
|
7 |
1 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
10000
|
0 |
0 |
maps.resize(MAP_TOTAL); |
|
|
1 |
0 |
maps.resize(MAP_TOTAL); |
|
10004
|
2 |
0 |
vector conllu_elementary_features |
|
|
0 |
2 |
vector conllu_elementary_features |
|
|
68 |
2 |
vector conllu_elementary_features |
|
|
0 |
0 |
vector conllu_elementary_features |
|
10049
|
7 |
1 |
for (unsigned i = forms.size(); i--;) { |
|
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
|
10053
|
10 |
7 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
10064
|
3 |
7 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
0 |
3 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
10069
|
0 |
10 |
if (index == string::npos) index = tag.size(); |
|
|
0 |
0 |
if (index == string::npos) index = tag.size(); |
|
10070
|
0 |
10 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
|
0 |
10 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
|
0 |
0 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
|
0 |
0 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
10072
|
10 |
0 |
if (index < tag.size()) index++; |
|
|
0 |
0 |
if (index < tag.size()) index++; |
|
10073
|
10 |
0 |
if (index < tag.size()) index = tag.find(separator, index); |
|
|
0 |
0 |
if (index < tag.size()) index = tag.find(separator, index); |
|
10074
|
10 |
0 |
if (index < tag.size()) index++; |
|
|
0 |
0 |
if (index < tag.size()) index++; |
|
10075
|
40 |
10 |
for (size_t length; index < tag.size(); index += length + 1) { |
|
|
0 |
0 |
for (size_t length; index < tag.size(); index += length + 1) { |
|
10077
|
6 |
34 |
length = (length == string::npos ? tag.size() : length) - index; |
|
|
0 |
0 |
length = (length == string::npos ? tag.size() : length) - index; |
|
10079
|
280 |
0 |
for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++) |
|
|
0 |
0 |
for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++) |
|
10080
|
240 |
40 |
if (tag[index + equal_sign] == '=') { |
|
|
0 |
0 |
if (tag[index + equal_sign] == '=') { |
|
10084
|
2 |
4 |
if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE; |
|
|
0 |
0 |
if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE; |
|
10087
|
2 |
14 |
if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER; |
|
|
0 |
0 |
if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER; |
|
10088
|
6 |
10 |
if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER; |
|
|
0 |
0 |
if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER; |
|
10089
|
4 |
12 |
if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON; |
|
|
0 |
0 |
if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON; |
|
10092
|
5 |
5 |
if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE; |
|
|
0 |
0 |
if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE; |
|
10096
|
19 |
21 |
if (value >= 0) |
|
|
0 |
0 |
if (value >= 0) |
|
10102
|
10 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
|
6 |
4 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
|
4 |
6 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
10104
|
2 |
2 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
2 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
10120
|
5 |
2 |
if (analyses[i].size() == 1) { |
|
|
0 |
0 |
if (analyses[i].size() == 1) { |
|
10128
|
0 |
2 |
} else if (forms[i].len <= 0) { |
|
|
0 |
0 |
} else if (forms[i].len <= 0) { |
|
10143
|
16 |
2 |
while (form.len) { |
|
|
0 |
0 |
while (form.len) { |
|
10147
|
16 |
0 |
num = num || cat & unicode::N; |
|
|
16 |
0 |
num = num || cat & unicode::N; |
|
|
0 |
0 |
num = num || cat & unicode::N; |
|
|
0 |
0 |
num = num || cat & unicode::N; |
|
10148
|
10 |
6 |
cap = cap || cat & unicode::Lut; |
|
|
9 |
1 |
cap = cap || cat & unicode::Lut; |
|
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
10149
|
16 |
0 |
dash = dash || cat & unicode::Pd; |
|
|
16 |
0 |
dash = dash || cat & unicode::Pd; |
|
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
10151
|
16 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
|
14 |
2 |
if (index == 10 || (!form.len && index < 10)) { |
|
|
0 |
2 |
if (index == 10 || (!form.len && index < 10)) { |
|
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
10181
|
12 |
3 |
if (prev_dynamic) { |
|
|
0 |
0 |
if (prev_dynamic) { |
|
10189
|
15 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
|
11 |
4 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
|
4 |
11 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
10243
|
0 |
0 |
maps.resize(MAP_TOTAL); |
|
10279
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
|
10283
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
10286
|
0 |
0 |
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
|
10287
|
0 |
0 |
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
|
10288
|
0 |
0 |
per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty; |
|
10289
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
10292
|
0 |
0 |
if (analyses[i][j].tag[0] == 'V') { |
|
10294
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
10304
|
0 |
0 |
if (verb_candidate >= 0) { |
|
10310
|
0 |
0 |
if (analyses[i].size() == 1) { |
|
10314
|
0 |
0 |
} else if (forms[i].len <= 0) { |
|
10325
|
0 |
0 |
while (form.len) { |
|
10329
|
0 |
0 |
num = num || cat & unicode::N; |
|
|
0 |
0 |
num = num || cat & unicode::N; |
|
10330
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
10331
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
10333
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
|
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
|
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
|
10353
|
0 |
0 |
if (prev_dynamic) { |
|
10361
|
0 |
0 |
if (tag.tag[0] == 'V') { |
|
10415
|
0 |
0 |
maps.resize(MAP_TOTAL); |
|
10463
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
|
10467
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
10469
|
0 |
0 |
per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty; |
|
10470
|
0 |
0 |
per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty; |
|
10471
|
0 |
0 |
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
|
10472
|
0 |
0 |
per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty; |
|
10473
|
0 |
0 |
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
|
10474
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
10477
|
0 |
0 |
if (analyses[i][j].tag[0] == 'V') { |
|
10479
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
10489
|
0 |
0 |
if (verb_candidate >= 0) { |
|
10495
|
0 |
0 |
if (analyses[i].size() == 1) { |
|
10503
|
0 |
0 |
} else if (forms[i].len <= 0) { |
|
10518
|
0 |
0 |
while (form.len) { |
|
10522
|
0 |
0 |
num = num || cat & unicode::N; |
|
|
0 |
0 |
num = num || cat & unicode::N; |
|
10523
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
10524
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
10526
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
10556
|
0 |
0 |
if (prev_dynamic) { |
|
10564
|
0 |
0 |
if (tag.tag[0] == 'V') { |
|
10615
|
0 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
|
0 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
|
1 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
10629
|
1 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
|
|
0 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
|
|
0 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
|
10631
|
1 |
0 |
if (!features.load(is)) return false; |
|
|
0 |
0 |
if (!features.load(is)) return false; |
|
|
0 |
0 |
if (!features.load(is)) return false; |
|
10643
|
0 |
0 |
if (!dict) return; |
|
|
0 |
0 |
if (!dict) return; |
|
|
1 |
0 |
if (!dict) return; |
|
10646
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
1 |
0 |
if (!c) c = new cache(*this); |
|
|
1 |
0 |
if (!c) c = new cache(*this); |
|
10649
|
0 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
|
0 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
|
1 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
10650
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
10653
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
|
7 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
|
0 |
7 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
10656
|
0 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
|
0 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
|
1 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
10659
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) |
|
10670
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
|
0 |
0 |
if (!c) c = new cache(*this); |
|
10696
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
10702
|
0 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
|
10703
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
10711
|
0 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
|
10712
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
10719
|
1 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
|
10720
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
10729
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
|
10730
|
0 |
0 |
if (!f) return nullptr; |
|
10732
|
0 |
0 |
return load(f); |
|
10737
|
0 |
0 |
return morpho ? morpho->new_tokenizer() : nullptr; |
|
10842
|
0 |
0 |
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
|
|
0 |
0 |
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
|
10843
|
0 |
0 |
if (pdt_tag[i] != '-') { |
|
10844
|
0 |
0 |
if (!tag.empty()) tag.push_back('|'); |
|
10851
|
0 |
0 |
for (unsigned i = 0; i + 2 < lemma.size(); i++) |
|
10852
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
10853
|
0 |
0 |
if (!tag.empty()) tag.push_back('|'); |
|
10862
|
0 |
0 |
return raw_lemma < lemma.size() ? (lemma.resize(raw_lemma), true) : false; |
|
10873
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) { |
|
10879
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
10887
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) { |
|
10888
|
0 |
0 |
for (auto&& tagged_form : tagged_lemma_forms.forms) |
|
10894
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
10948
|
0 |
0 |
return lemma_id_len < lemma.size() ? (lemma.resize(lemma_id_len), true) : false; |
|
10958
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) |
|
10962
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
10970
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) |
|
10974
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
11028
|
0 |
0 |
return raw_lemma_len < lemma.size() ? (lemma.resize(raw_lemma_len), true) : false; |
|
11038
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) |
|
11042
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
11050
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) |
|
11054
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
11093
|
0 |
0 |
if (name == "pdt_to_conll2009") return tagset_converter::new_pdt_to_conll2009_converter(); |
|
11094
|
0 |
0 |
if (name == "strip_lemma_comment") return tagset_converter::new_strip_lemma_comment_converter(dictionary); |
|
11095
|
0 |
0 |
if (name == "strip_lemma_id") return tagset_converter::new_strip_lemma_id_converter(dictionary); |
|
11102
|
0 |
0 |
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
|
|
0 |
0 |
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
|
11103
|
0 |
0 |
inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); } |
|
11112
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
11114
|
0 |
0 |
for (unsigned j = forms.size() - 1; j > i; j--) |
|
11115
|
0 |
0 |
if (forms[j].lemma == forms[i].lemma) { |
|
11117
|
0 |
0 |
for (auto&& tagged_form : forms[j].forms) |
|
11121
|
0 |
0 |
if (j < forms.size() - 1) { |
|
11129
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
|
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
|
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
|
11132
|
0 |
0 |
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
|
|
0 |
0 |
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
|
11133
|
0 |
0 |
inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 || (tag_compare == 0 && a.form < b.form); } |
|
11291
|
214 |
2 |
const unordered_set czech_tokenizer::abbreviations_czech = { |
|
|
0 |
0 |
const unordered_set czech_tokenizer::abbreviations_czech = { |
|
11307
|
206 |
2 |
const unordered_set czech_tokenizer::abbreviations_slovak = { |
|
|
0 |
0 |
const unordered_set czech_tokenizer::abbreviations_slovak = { |
|
11324
|
0 |
0 |
: ragel_tokenizer(version <= 1 ? 1 : 2), m(m) { |
|
|
0 |
0 |
: ragel_tokenizer(version <= 1 ? 1 : 2), m(m) { |
|
11338
|
0 |
0 |
if (!m) return; |
|
11339
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
11342
|
0 |
0 |
for (unsigned hyphens = 1; hyphens <= 2; hyphens++) { |
|
11344
|
0 |
0 |
if (tokens.size() < 2*hyphens + 1) break; |
|
11346
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
11347
|
0 |
0 |
tokens[first_hyphen].start + tokens[first_hyphen].length != tokens[first_hyphen + 1].start || |
|
11348
|
0 |
0 |
tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start || |
|
|
0 |
0 |
tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start || |
|
11352
|
0 |
0 |
if (m->analyze(string_piece(chars[tokens[first_hyphen-1].start].str, chars[tokens.back().start + tokens.back().length].str - chars[tokens[first_hyphen-1].start].str), morpho::NO_GUESSER, lemmas) >= 0) |
|
11356
|
0 |
0 |
if (matched_hyphens) { |
|
11370
|
0 |
0 |
while (tokenize_url_email(tokens)) |
|
11371
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
|
11387
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
|
11392
|
0 |
0 |
switch ( _czech_tokenizer_from_state_actions[cs] ) { |
|
11401
|
0 |
0 |
if ( _klen > 0 ) { |
|
11406
|
0 |
0 |
if ( _upper < _lower ) |
|
11410
|
0 |
0 |
if ( _widec < _mid[0] ) |
|
11412
|
0 |
0 |
else if ( _widec > _mid[1] ) |
|
11418
|
0 |
0 |
if ( |
|
11419
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
11424
|
0 |
0 |
if ( |
|
11425
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
11438
|
0 |
0 |
if ( _klen > 0 ) { |
|
11443
|
0 |
0 |
if ( _upper < _lower ) |
|
11447
|
0 |
0 |
if ( _widec < *_mid ) |
|
11449
|
0 |
0 |
else if ( _widec > *_mid ) |
|
11461
|
0 |
0 |
if ( _klen > 0 ) { |
|
11466
|
0 |
0 |
if ( _upper < _lower ) |
|
11470
|
0 |
0 |
if ( _widec < _mid[0] ) |
|
11472
|
0 |
0 |
else if ( _widec > _mid[1] ) |
|
11487
|
0 |
0 |
if ( _czech_tokenizer_trans_actions[_trans] == 0 ) |
|
11501
|
0 |
0 |
do |
|
11502
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11510
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
|
11513
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
|
11518
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
|
11520
|
0 |
0 |
do |
|
11521
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11530
|
0 |
0 |
do |
|
11531
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11539
|
0 |
0 |
do |
|
11540
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11547
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
|
11549
|
0 |
0 |
do |
|
11550
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11559
|
0 |
0 |
do |
|
11560
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
11568
|
0 |
0 |
switch ( _czech_tokenizer_to_state_actions[cs] ) { |
|
11574
|
0 |
0 |
if ( cs == 0 ) |
|
11576
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
|
11579
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
|
11581
|
0 |
0 |
if ( _czech_tokenizer_eof_trans[cs] > 0 ) { |
|
11641
|
0 |
0 |
return new czech_tokenizer(language, version, m); |
|
11648
|
0 |
0 |
return bool(is) && (language == czech_tokenizer::CZECH || language == czech_tokenizer::SLOVAK); |
|
|
0 |
0 |
return bool(is) && (language == czech_tokenizer::CZECH || language == czech_tokenizer::SLOVAK); |
|
11713
|
228 |
2 |
const unordered_set english_tokenizer::abbreviations = { |
|
|
0 |
0 |
const unordered_set english_tokenizer::abbreviations = { |
|
11812
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
11827
|
0 |
0 |
if ( ( index) == ( end) ) |
|
11836
|
0 |
0 |
if ( _klen > 0 ) { |
|
11841
|
0 |
0 |
if ( _upper < _lower ) |
|
11845
|
0 |
0 |
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid ) |
|
11847
|
0 |
0 |
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid ) |
|
11859
|
0 |
0 |
if ( _klen > 0 ) { |
|
11864
|
0 |
0 |
if ( _upper < _lower ) |
|
11868
|
0 |
0 |
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] ) |
|
11870
|
0 |
0 |
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] ) |
|
11884
|
0 |
0 |
if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 ) |
|
11898
|
0 |
0 |
if ( cs == 0 ) |
|
11900
|
0 |
0 |
if ( ++( index) != ( end) ) |
|
11903
|
0 |
0 |
if ( ( index) == ( end) ) |
|
11905
|
0 |
0 |
switch ( _english_tokenizer_split_token_eof_actions[cs] ) { |
|
11915
|
0 |
0 |
if (split_len && split_len < end) { |
|
12069
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
12078
|
0 |
0 |
while (tokenize_url_email(tokens)) |
|
12079
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
|
12095
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
|
12100
|
0 |
0 |
switch ( _english_tokenizer_from_state_actions[cs] ) { |
|
12109
|
0 |
0 |
if ( _klen > 0 ) { |
|
12114
|
0 |
0 |
if ( _upper < _lower ) |
|
12118
|
0 |
0 |
if ( _widec < _mid[0] ) |
|
12120
|
0 |
0 |
else if ( _widec > _mid[1] ) |
|
12126
|
0 |
0 |
if ( |
|
12127
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
12132
|
0 |
0 |
if ( |
|
12133
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
12146
|
0 |
0 |
if ( _klen > 0 ) { |
|
12151
|
0 |
0 |
if ( _upper < _lower ) |
|
12155
|
0 |
0 |
if ( _widec < *_mid ) |
|
12157
|
0 |
0 |
else if ( _widec > *_mid ) |
|
12169
|
0 |
0 |
if ( _klen > 0 ) { |
|
12174
|
0 |
0 |
if ( _upper < _lower ) |
|
12178
|
0 |
0 |
if ( _widec < _mid[0] ) |
|
12180
|
0 |
0 |
else if ( _widec > _mid[1] ) |
|
12195
|
0 |
0 |
if ( _english_tokenizer_trans_actions[_trans] == 0 ) |
|
12209
|
0 |
0 |
do |
|
12210
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12218
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
|
12221
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
|
12226
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
|
12228
|
0 |
0 |
do |
|
12229
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12238
|
0 |
0 |
do |
|
12239
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12247
|
0 |
0 |
do |
|
12248
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12255
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
|
12257
|
0 |
0 |
do |
|
12258
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12267
|
0 |
0 |
do |
|
12268
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12276
|
0 |
0 |
switch ( _english_tokenizer_to_state_actions[cs] ) { |
|
12282
|
0 |
0 |
if ( cs == 0 ) |
|
12284
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
|
12287
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
|
12289
|
0 |
0 |
if ( _english_tokenizer_eof_trans[cs] > 0 ) { |
|
12446
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
12455
|
0 |
0 |
while (tokenize_url_email(tokens)) |
|
12456
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
|
12472
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
|
12477
|
0 |
0 |
switch ( _generic_tokenizer_from_state_actions[cs] ) { |
|
12486
|
0 |
0 |
if ( _klen > 0 ) { |
|
12491
|
0 |
0 |
if ( _upper < _lower ) |
|
12495
|
0 |
0 |
if ( _widec < _mid[0] ) |
|
12497
|
0 |
0 |
else if ( _widec > _mid[1] ) |
|
12503
|
0 |
0 |
if ( |
|
12504
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
12509
|
0 |
0 |
if ( |
|
12510
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
12523
|
0 |
0 |
if ( _klen > 0 ) { |
|
12528
|
0 |
0 |
if ( _upper < _lower ) |
|
12532
|
0 |
0 |
if ( _widec < *_mid ) |
|
12534
|
0 |
0 |
else if ( _widec > *_mid ) |
|
12546
|
0 |
0 |
if ( _klen > 0 ) { |
|
12551
|
0 |
0 |
if ( _upper < _lower ) |
|
12555
|
0 |
0 |
if ( _widec < _mid[0] ) |
|
12557
|
0 |
0 |
else if ( _widec > _mid[1] ) |
|
12572
|
0 |
0 |
if ( _generic_tokenizer_trans_actions[_trans] == 0 ) |
|
12585
|
0 |
0 |
do |
|
12586
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12594
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
|
12597
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
|
12602
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
|
12604
|
0 |
0 |
do |
|
12605
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12613
|
0 |
0 |
do |
|
12614
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12622
|
0 |
0 |
do |
|
12623
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12630
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
|
12632
|
0 |
0 |
do |
|
12633
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12641
|
0 |
0 |
do |
|
12642
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
|
12650
|
0 |
0 |
switch ( _generic_tokenizer_to_state_actions[cs] ) { |
|
12656
|
0 |
0 |
if ( cs == 0 ) |
|
12658
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
|
12661
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
|
12663
|
0 |
0 |
if ( _generic_tokenizer_eof_trans[cs] > 0 ) { |
|
12726
|
0 |
0 |
version = is.get(); |
|
12771
|
0 |
0 |
os.put(version); |
|
12908
|
2 |
2 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
12914
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
|
192 |
12 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
|
6 |
2 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
12930
|
1 |
0 |
if (chars.empty()) return; |
|
|
0 |
0 |
if (chars.empty()) return; |
|
|
0 |
0 |
if (chars.empty()) return; |
|
12934
|
34 |
1 |
for (size_t i = 0; i < chars.size(); i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < chars.size(); i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < chars.size(); i++) { |
|
12942
|
0 |
0 |
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
|
|
0 |
0 |
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
|
|
0 |
0 |
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
|
12943
|
0 |
0 |
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
|
|
0 |
0 |
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
|
|
0 |
0 |
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
|
12944
|
0 |
0 |
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
|
|
0 |
0 |
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
|
|
0 |
0 |
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
|
12947
|
34 |
0 |
if (embedding != embeddings.end()) { |
|
|
0 |
0 |
if (embedding != embeddings.end()) { |
|
|
0 |
0 |
if (embedding != embeddings.end()) { |
|
12952
|
0 |
0 |
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
|
|
0 |
0 |
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
|
|
0 |
0 |
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
|
12957
|
34 |
1 |
for (auto&& outcome : outcomes) |
|
|
0 |
0 |
for (auto&& outcome : outcomes) |
|
|
0 |
0 |
for (auto&& outcome : outcomes) |
|
12958
|
102 |
34 |
for (int i = 0; i < 3; i++) |
|
|
0 |
0 |
for (int i = 0; i < 3; i++) |
|
|
0 |
0 |
for (int i = 0; i < 3; i++) |
|
12963
|
2 |
1 |
for (int dir = 0; dir < 2; dir++) { |
|
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
12964
|
1 |
1 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
12965
|
1 |
1 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
12968
|
68 |
2 |
for (size_t i = 0; i < outcomes.size(); i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < outcomes.size(); i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < outcomes.size(); i++) { |
|
12969
|
34 |
34 |
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
|
|
0 |
0 |
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
|
|
0 |
0 |
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
|
12970
|
34 |
34 |
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
|
|
0 |
0 |
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
|
|
0 |
0 |
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
|
12972
|
68 |
1088 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
12975
|
17408 |
1088 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
12983
|
68 |
1088 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
12985
|
17408 |
1088 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
12991
|
204 |
68 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
12992
|
3264 |
204 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
12998
|
34 |
1 |
for (auto&& outcome : outcomes) { |
|
|
0 |
0 |
for (auto&& outcome : outcomes) { |
|
|
0 |
0 |
for (auto&& outcome : outcomes) { |
|
13000
|
1 |
33 |
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
|
|
0 |
0 |
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
|
|
0 |
0 |
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
|
13008
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
|
1 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
|
20 |
1 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
13009
|
0 |
0 |
auto& embedding = network->embeddings[data.next_4B()]; |
|
|
0 |
0 |
auto& embedding = network->embeddings[data.next_4B()]; |
|
|
20 |
0 |
auto& embedding = network->embeddings[data.next_4B()]; |
|
13010
|
0 |
0 |
copy_n(data.next(D), D, embedding.e.w[0]); |
|
|
0 |
0 |
copy_n(data.next(D), D, embedding.e.w[0]); |
|
|
20 |
0 |
copy_n(data.next(D), D, embedding.e.w[0]); |
|
13014
|
0 |
0 |
network->gru_fwd.load(data); |
|
|
0 |
0 |
network->gru_fwd.load(data); |
|
|
1 |
0 |
network->gru_fwd.load(data); |
|
13015
|
0 |
0 |
network->gru_bwd.load(data); |
|
|
0 |
0 |
network->gru_bwd.load(data); |
|
|
1 |
0 |
network->gru_bwd.load(data); |
|
13016
|
0 |
0 |
network->projection_fwd.load(data); |
|
|
0 |
0 |
network->projection_fwd.load(data); |
|
|
1 |
0 |
network->projection_fwd.load(data); |
|
13017
|
0 |
0 |
network->projection_bwd.load(data); |
|
|
0 |
0 |
network->projection_bwd.load(data); |
|
|
1 |
0 |
network->projection_bwd.load(data); |
|
13020
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
|
1 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
|
4 |
1 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
13021
|
0 |
0 |
unilib::unicode::category_t cat = data.next_4B(); |
|
|
0 |
0 |
unilib::unicode::category_t cat = data.next_4B(); |
|
|
4 |
0 |
unilib::unicode::category_t cat = data.next_4B(); |
|
13022
|
0 |
0 |
network->unknown_chars[cat] = data.next_4B(); |
|
|
0 |
0 |
network->unknown_chars[cat] = data.next_4B(); |
|
|
4 |
0 |
network->unknown_chars[cat] = data.next_4B(); |
|
13032
|
0 |
0 |
for (auto&& embedding : embeddings) { |
|
|
0 |
0 |
for (auto&& embedding : embeddings) { |
|
|
20 |
1 |
for (auto&& embedding : embeddings) { |
|
13036
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
|
|
20 |
120 |
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
|
13037
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
13038
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
13039
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
13040
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
13041
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
13042
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
13044
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
|
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
|
|
6 |
1 |
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
|
13067
|
0 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
|
0 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
|
0 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
|
1 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
13102
|
29 |
5 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
29 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
29 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
0 |
29 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
4 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
4 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
4 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
0 |
4 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
28 |
5 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
28 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
28 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
0 |
28 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
7 |
5 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
7 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
7 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
|
0 |
7 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
13109
|
1 |
1 |
if (current == 0) network_index = network_length = 0; |
|
13112
|
8 |
1 |
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
|
|
0 |
8 |
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
|
|
8 |
1 |
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
|
13113
|
12 |
1 |
while (current < chars.size() - 1 && is_space(current)) |
|
|
7 |
5 |
while (current < chars.size() - 1 && is_space(current)) |
|
|
5 |
8 |
while (current < chars.size() - 1 && is_space(current)) |
|
13114
|
0 |
5 |
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
0 |
0 |
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
5 |
0 |
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
13117
|
7 |
1 |
if (current >= chars.size() - 1) break; |
|
13120
|
0 |
7 |
if (tokenize_url_email(tokens)) { |
|
13121
|
0 |
0 |
while (network_index < network_length && network_offsets[network_index] < current) |
|
|
0 |
0 |
while (network_index < network_length && network_offsets[network_index] < current) |
|
|
0 |
0 |
while (network_index < network_length && network_offsets[network_index] < current) |
|
13122
|
0 |
0 |
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
0 |
0 |
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
|
0 |
0 |
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
13129
|
22 |
0 |
do { |
|
13132
|
22 |
7 |
if (outcome != gru_tokenizer_network::NO_SPLIT) break; |
|
13141
|
1 |
33 |
if (network_index >= network_length) { |
|
13150
|
34 |
1 |
for (size_t offset = current; |
|
13151
|
34 |
1 |
network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment; |
|
|
0 |
34 |
network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment; |
|
13153
|
5 |
29 |
if (is_space(offset)) { |
|
13155
|
4 |
1 |
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
|
|
4 |
0 |
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
|
|
0 |
5 |
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
|
13161
|
1 |
0 |
if (network_length < segment && network_chars.back().chr != ' ') |
|
|
1 |
0 |
if (network_length < segment && network_chars.back().chr != ' ') |
|
|
0 |
1 |
if (network_length < segment && network_chars.back().chr != ' ') |
|
13169
|
33 |
1 |
for (size_t i = 0; i < network_length - 1; i++) |
|
13170
|
28 |
5 |
if (is_space(network_offsets[i+1])) { |
|
13173
|
1 |
4 |
if (i + 2 == network_length) eos = true; |
|
13174
|
0 |
5 |
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
|
|
0 |
0 |
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
|
|
0 |
5 |
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
|
13175
|
0 |
0 |
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
|
|
0 |
0 |
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
|
|
0 |
0 |
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
|
13176
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
13177
|
1 |
4 |
if (eos) network_outcomes[i].outcome = gru_tokenizer_network::END_OF_SENTENCE; |
|
13179
|
1 |
4 |
if (network_outcomes[i].outcome == gru_tokenizer_network::NO_SPLIT) |
|
13181
|
0 |
4 |
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
|
|
0 |
0 |
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
|
|
0 |
4 |
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
|
13186
|
0 |
1 |
if (network_length == segment && network_length >= 10) { |
|
|
0 |
0 |
if (network_length == segment && network_length >= 10) { |
|
13188
|
0 |
0 |
while (network_length > segment / 2) |
|
13189
|
0 |
0 |
if (network_outcomes[--network_length].outcome != gru_tokenizer_network::NO_SPLIT) |
|
13251
|
1 |
0 |
if (!is.get(version)) return false; |
|
13252
|
1 |
0 |
if (!(version >= 1 && version <= 2)) return false; |
|
13255
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
13258
|
1 |
0 |
url_email_tokenizer = data.next_1B(); |
|
13259
|
1 |
0 |
segment = data.next_2B(); |
|
13260
|
0 |
1 |
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
|
|
0 |
0 |
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
|
|
0 |
0 |
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
|
13262
|
1 |
0 |
network.reset(gru_tokenizer_network::load(data)); |
|
13263
|
1 |
0 |
if (!network) return false; |
|
|
0 |
0 |
if (!network) return false; |
|
13289
|
1 |
0 |
if (data.next_1B() != 1) return nullptr; |
|
13352
|
0 |
0 |
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
|
|
0 |
0 |
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
|
|
0 |
0 |
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
|
13370
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
13380
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
13405
|
0 |
0 |
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
|
|
0 |
0 |
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
|
|
0 |
0 |
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
|
13408
|
0 |
0 |
for (auto&& sentence : data) |
|
|
0 |
0 |
for (auto&& sentence : data) |
|
|
0 |
0 |
for (auto&& sentence : data) |
|
13410
|
0 |
0 |
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
|
|
0 |
0 |
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
|
|
0 |
0 |
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
|
13418
|
0 |
0 |
for (auto&& sentence : data) |
|
|
0 |
0 |
for (auto&& sentence : data) |
|
|
0 |
0 |
for (auto&& sentence : data) |
|
13419
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
|
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
|
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
|
13435
|
0 |
0 |
for (auto&& embedding : this->embeddings) |
|
|
0 |
0 |
for (auto&& embedding : this->embeddings) |
|
|
0 |
0 |
for (auto&& embedding : this->embeddings) |
|
13437
|
0 |
0 |
vector*> chosen_embeddings(segment); |
|
|
0 |
0 |
vector*> chosen_embeddings(segment); |
|
|
0 |
0 |
vector*> chosen_embeddings(segment); |
|
13438
|
0 |
0 |
vector> embedding_dropouts(segment); |
|
|
0 |
0 |
vector> embedding_dropouts(segment); |
|
|
0 |
0 |
vector> embedding_dropouts(segment); |
|
13439
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
13447
|
0 |
0 |
vector training_input, instance_input(segment); |
|
|
0 |
0 |
vector training_input, instance_input(segment); |
|
|
0 |
0 |
vector training_input, instance_input(segment); |
|
13448
|
0 |
0 |
vector training_output, instance_output(segment); |
|
|
0 |
0 |
vector training_output, instance_output(segment); |
|
|
0 |
0 |
vector training_output, instance_output(segment); |
|
13449
|
0 |
0 |
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
|
|
0 |
0 |
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
|
|
0 |
0 |
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
|
13450
|
0 |
0 |
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
|
|
0 |
0 |
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
|
|
0 |
0 |
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
|
13454
|
0 |
0 |
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
|
|
0 |
0 |
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
|
|
0 |
0 |
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
|
13456
|
0 |
0 |
if (training_offset + segment >= training_input.size()) { |
|
|
0 |
0 |
if (training_offset + segment >= training_input.size()) { |
|
|
0 |
0 |
if (training_offset + segment >= training_input.size()) { |
|
13459
|
0 |
0 |
for (auto&& index : permutation) { |
|
|
0 |
0 |
for (auto&& index : permutation) { |
|
|
0 |
0 |
for (auto&& index : permutation) { |
|
13461
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
13464
|
0 |
0 |
training_input.resize(training_offset + sentence.sentence.size()); |
|
|
0 |
0 |
training_input.resize(training_offset + sentence.sentence.size()); |
|
|
0 |
0 |
training_input.resize(training_offset + sentence.sentence.size()); |
|
13465
|
0 |
0 |
training_output.resize(training_offset + sentence.sentence.size()); |
|
|
0 |
0 |
training_output.resize(training_offset + sentence.sentence.size()); |
|
|
0 |
0 |
training_output.resize(training_offset + sentence.sentence.size()); |
|
13466
|
0 |
0 |
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
|
13470
|
0 |
0 |
for (size_t i = 0; i < sentence.tokens.size(); i++) |
|
|
0 |
0 |
for (size_t i = 0; i < sentence.tokens.size(); i++) |
|
|
0 |
0 |
for (size_t i = 0; i < sentence.tokens.size(); i++) |
|
13471
|
0 |
0 |
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
|
|
0 |
0 |
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
|
|
0 |
0 |
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
|
13480
|
0 |
0 |
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
|
|
0 |
0 |
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
|
|
0 |
0 |
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
|
13481
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
13486
|
0 |
0 |
for (unsigned i = 0; i < segment; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < segment; i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < segment; i++) { |
|
13488
|
0 |
0 |
for (unsigned k = 0; k < D; k++) |
|
|
0 |
0 |
for (unsigned k = 0; k < D; k++) |
|
|
0 |
0 |
for (unsigned k = 0; k < D; k++) |
|
13489
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
13490
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
13494
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
13495
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
13496
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
13499
|
0 |
0 |
for (size_t i = 0; i < segment; i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < segment; i++) { |
|
|
0 |
0 |
for (size_t i = 0; i < segment; i++) { |
|
13500
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
13501
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
13502
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
13504
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
13507
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
13515
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
13517
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
13523
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
13524
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
13526
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
13527
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
13532
|
0 |
0 |
for (auto&& output : instance_output) { |
|
|
0 |
0 |
for (auto&& output : instance_output) { |
|
|
0 |
0 |
for (auto&& output : instance_output) { |
|
13534
|
0 |
0 |
if (output.w[2] > output.w[best]) best = 2; |
|
|
0 |
0 |
if (output.w[2] > output.w[best]) best = 2; |
|
|
0 |
0 |
if (output.w[2] > output.w[best]) best = 2; |
|
13536
|
0 |
0 |
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
|
13538
|
0 |
0 |
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
|
13546
|
0 |
0 |
for (auto&& output : instance_output) |
|
|
0 |
0 |
for (auto&& output : instance_output) |
|
|
0 |
0 |
for (auto&& output : instance_output) |
|
13547
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
13548
|
0 |
0 |
output.w[j] = (output.outcome == j) - output.w[j]; |
|
|
0 |
0 |
output.w[j] = (output.outcome == j) - output.w[j]; |
|
|
0 |
0 |
output.w[j] = (output.outcome == j) - output.w[j]; |
|
13550
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
13551
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
13552
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
13556
|
0 |
0 |
for (size_t i = segment; i--; ) { |
|
|
0 |
0 |
for (size_t i = segment; i--; ) { |
|
|
0 |
0 |
for (size_t i = segment; i--; ) { |
|
13557
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
13558
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
13559
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
13561
|
0 |
0 |
for (int j = 0; j < D; j++) // These for cycles are swapped because |
|
|
0 |
0 |
for (int j = 0; j < D; j++) // These for cycles are swapped because |
|
|
0 |
0 |
for (int j = 0; j < D; j++) // These for cycles are swapped because |
|
13562
|
0 |
0 |
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
|
|
0 |
0 |
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
|
|
0 |
0 |
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
|
13565
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
13566
|
0 |
0 |
if (gru.dropouts[i].w[0][j]) |
|
|
0 |
0 |
if (gru.dropouts[i].w[0][j]) |
|
|
0 |
0 |
if (gru.dropouts[i].w[0][j]) |
|
13567
|
0 |
0 |
for (int k = 0; k < 3; k++) |
|
|
0 |
0 |
for (int k = 0; k < 3; k++) |
|
|
0 |
0 |
for (int k = 0; k < 3; k++) |
|
13571
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
13578
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
13585
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
13594
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
13608
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
|
0 |
0 |
if (batch_size == 1 || |
|
13615
|
0 |
0 |
if (batch_size == 1) |
|
|
0 |
0 |
if (batch_size == 1) |
|
|
0 |
0 |
if (batch_size == 1) |
|
13616
|
0 |
0 |
for (auto&& chosen_embedding : chosen_embeddings) |
|
|
0 |
0 |
for (auto&& chosen_embedding : chosen_embeddings) |
|
|
0 |
0 |
for (auto&& chosen_embedding : chosen_embeddings) |
|
13619
|
0 |
0 |
for (auto&& embedding : embeddings) |
|
|
0 |
0 |
for (auto&& embedding : embeddings) |
|
|
0 |
0 |
for (auto&& embedding : embeddings) |
|
13627
|
0 |
0 |
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
|
|
0 |
0 |
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
|
|
0 |
0 |
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
|
13631
|
0 |
0 |
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
|
|
0 |
0 |
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
|
|
0 |
0 |
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
|
13633
|
0 |
0 |
if (!heldout.empty()) { |
|
|
0 |
0 |
if (!heldout.empty()) { |
|
|
0 |
0 |
if (!heldout.empty()) { |
|
13635
|
0 |
0 |
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
|
|
0 |
0 |
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
|
|
0 |
0 |
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
|
13636
|
0 |
0 |
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
|
|
0 |
0 |
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
|
|
0 |
0 |
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
|
13640
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
13645
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
13654
|
0 |
0 |
if (early_stopping && best_combined_f1) { |
|
|
0 |
0 |
if (early_stopping && best_combined_f1) { |
|
|
0 |
0 |
if (early_stopping && best_combined_f1) { |
|
13664
|
0 |
0 |
enc.add_1B(1); |
|
|
0 |
0 |
enc.add_1B(1); |
|
|
0 |
0 |
enc.add_1B(1); |
|
13665
|
0 |
0 |
enc.add_1B(D); |
|
|
0 |
0 |
enc.add_1B(D); |
|
|
0 |
0 |
enc.add_1B(D); |
|
13668
|
0 |
0 |
for (auto&& embedding : this->embeddings) { |
|
|
0 |
0 |
for (auto&& embedding : this->embeddings) { |
|
|
0 |
0 |
for (auto&& embedding : this->embeddings) { |
|
13672
|
0 |
0 |
save_gru(this->gru_fwd, enc); |
|
|
0 |
0 |
save_gru(this->gru_fwd, enc); |
|
|
0 |
0 |
save_gru(this->gru_fwd, enc); |
|
13673
|
0 |
0 |
save_gru(this->gru_bwd, enc); |
|
|
0 |
0 |
save_gru(this->gru_bwd, enc); |
|
|
0 |
0 |
save_gru(this->gru_bwd, enc); |
|
13674
|
0 |
0 |
save_matrix(this->projection_fwd, enc); |
|
|
0 |
0 |
save_matrix(this->projection_fwd, enc); |
|
|
0 |
0 |
save_matrix(this->projection_fwd, enc); |
|
13675
|
0 |
0 |
save_matrix(this->projection_bwd, enc); |
|
|
0 |
0 |
save_matrix(this->projection_bwd, enc); |
|
|
0 |
0 |
save_matrix(this->projection_bwd, enc); |
|
13682
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
13683
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
13693
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
13694
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
13716
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
13717
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
13719
|
0 |
0 |
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
|
|
0 |
0 |
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
|
|
0 |
0 |
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
|
13720
|
0 |
0 |
for (auto&& token : sentence.tokens) |
|
|
0 |
0 |
for (auto&& token : sentence.tokens) |
|
|
0 |
0 |
for (auto&& token : sentence.tokens) |
|
13721
|
0 |
0 |
gold_tokens.emplace_back(text.size() + token.start, token.length); |
|
|
0 |
0 |
gold_tokens.emplace_back(text.size() + token.start, token.length); |
|
|
0 |
0 |
gold_tokens.emplace_back(text.size() + token.start, token.length); |
|
13731
|
0 |
0 |
unilib::utf8::encode(text, text_utf8); |
|
|
0 |
0 |
unilib::utf8::encode(text, text_utf8); |
|
|
0 |
0 |
unilib::utf8::encode(text, text_utf8); |
|
13732
|
0 |
0 |
tokenizer.set_text(text_utf8); |
|
|
0 |
0 |
tokenizer.set_text(text_utf8); |
|
|
0 |
0 |
tokenizer.set_text(text_utf8); |
|
13734
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
13735
|
0 |
0 |
if (!tokens.empty()) { |
|
|
0 |
0 |
if (!tokens.empty()) { |
|
|
0 |
0 |
if (!tokens.empty()) { |
|
13736
|
0 |
0 |
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
|
|
0 |
0 |
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
|
|
0 |
0 |
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
|
13737
|
0 |
0 |
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
|
|
0 |
0 |
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
|
|
0 |
0 |
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
|
13747
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
13748
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
13750
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
13755
|
0 |
0 |
f1.precision = system.size() ? both / double(system.size()) : 0.; |
|
|
0 |
0 |
f1.precision = system.size() ? both / double(system.size()) : 0.; |
|
|
0 |
0 |
f1.precision = system.size() ? both / double(system.size()) : 0.; |
|
13756
|
0 |
0 |
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
|
|
0 |
0 |
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
|
|
0 |
0 |
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
|
13757
|
0 |
0 |
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
|
|
0 |
0 |
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
|
|
0 |
0 |
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
|
13763
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
13765
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
13782
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
13826
|
0 |
0 |
enc.add_1B(url_email_tokenizer); |
|
13827
|
0 |
0 |
enc.add_2B(segment); |
|
13828
|
0 |
0 |
enc.add_1B(allow_spaces); |
|
13831
|
0 |
0 |
if (dimension == 16) { |
|
13833
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
13835
|
0 |
0 |
} else if (dimension == 24) { |
|
13837
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
13839
|
0 |
0 |
} else if (dimension == 64) { |
|
13841
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
13844
|
0 |
0 |
return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false; |
|
|
0 |
0 |
return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false; |
|
13849
|
0 |
0 |
for (auto&& sentence : data) |
|
13850
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
|
13854
|
0 |
0 |
for (auto&& count : counts) { |
|
13857
|
0 |
0 |
for (auto&& chr : count.second) |
|
13858
|
0 |
0 |
if (chr.second > best) |
|
13860
|
0 |
0 |
if (best_chr) |
|
13863
|
0 |
0 |
enc.add_1B(unknown_chars.size()); |
|
13864
|
0 |
0 |
for (auto&& unknown_char : unknown_chars) { |
|
13869
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
|
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
|
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
|
14241
|
0 |
0 |
initialize_ragel_map(); |
|
14245
|
0 |
1 |
while (ragel_map_flag.test_and_set()) {} |
|
14246
|
1 |
0 |
if (ragel_map.empty()) { |
|
14247
|
128 |
1 |
for (uint8_t ascii = 0; ascii < 128; ascii++) |
|
14259
|
1 |
3 |
if (chr >= ragel_map.size()) |
|
14279
|
7 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
|
14287
|
0 |
30 |
if ( _klen > 0 ) { |
|
14292
|
0 |
0 |
if ( _upper < _lower ) |
|
14296
|
0 |
0 |
if ( _widec < _mid[0] ) |
|
14298
|
0 |
0 |
else if ( _widec > _mid[1] ) |
|
14304
|
0 |
0 |
if ( |
|
14310
|
0 |
0 |
if ( |
|
14324
|
30 |
0 |
if ( _klen > 0 ) { |
|
14329
|
87 |
30 |
if ( _upper < _lower ) |
|
14333
|
13 |
74 |
if ( _widec < *_mid ) |
|
14335
|
74 |
0 |
else if ( _widec > *_mid ) |
|
14347
|
30 |
0 |
if ( _klen > 0 ) { |
|
14352
|
86 |
7 |
if ( _upper < _lower ) |
|
14356
|
9 |
77 |
if ( _widec < _mid[0] ) |
|
14358
|
54 |
23 |
else if ( _widec > _mid[1] ) |
|
14372
|
0 |
30 |
if ( _ragel_url_email_trans_actions[_trans] == 0 ) |
|
14393
|
23 |
7 |
if ( cs == 0 ) |
|
14395
|
23 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
|
14401
|
0 |
7 |
if (end > start) { |
|
14430
|
0 |
0 |
vertical_tokenizer() : unicode_tokenizer(0) {} |
|
14528
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
14534
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
14540
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
14551
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
|
14552
|
0 |
0 |
if (!f) return nullptr; |
|
14554
|
0 |
0 |
return load(f); |
|
14575
|
1 |
0 |
ragel_tokenizer::initialize_ragel_map(); |
|
14577
|
1 |
0 |
set_text(string_piece(nullptr, 0)); |
|
14583
|
0 |
2 |
if (make_copy && text.str) { |
|
|
0 |
0 |
if (make_copy && text.str) { |
|
14590
|
34 |
2 |
for (const char* curr_str = text.str; text.len; curr_str = text.str) |
|
14596
|
2 |
0 |
vector& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer; |
|
14598
|
2 |
0 |
if (forms) forms->clear(); |
|
14599
|
2 |
0 |
if (current >= chars.size() - 1) return false; |
|
14602
|
2 |
0 |
if (forms) |
|
14603
|
7 |
2 |
for (auto&& token : tokens) |
|
14610
|
7 |
0 |
if (current >= chars.size() - 1) return false; |
|
14612
|
7 |
0 |
return url_email_tokenizer ? ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens) : false; |
|
14619
|
0 |
8 |
return tokens.size() >= 500 || |
|
14620
|
8 |
0 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
|
0 |
0 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
|
0 |
8 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
14621
|
0 |
0 |
(tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po); |
|
14627
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
|
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
|
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
|
14629
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
14633
|
0 |
0 |
if (abbreviations) { |
|
14635
|
0 |
0 |
for (size_t i = 0; i < tokens.back().length; i++) |
|
14637
|
0 |
0 |
if (abbreviations->count(eos_buffer)) |
|
14662
|
0 |
0 |
if (current >= chars.size() - 1) return false; |
|
14666
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
14669
|
0 |
0 |
if (current < chars.size() - 1) { |
|
14671
|
0 |
0 |
if (current < chars.size() - 1 && |
|
|
0 |
0 |
if (current < chars.size() - 1 && |
|
|
0 |
0 |
if (current < chars.size() - 1 && |
|
14672
|
0 |
0 |
((chars[current-1].chr == '\r' && chars[current].chr == '\n') || |
|
|
0 |
0 |
((chars[current-1].chr == '\r' && chars[current].chr == '\n') || |
|
14673
|
0 |
0 |
(chars[current-1].chr == '\n' && chars[current].chr == '\r'))) |
|
14677
|
0 |
0 |
if (line_start < line_end) |
|
14765
|
0 |
0 |
return {1, 11, 1, "dev"}; |
|
|
0 |
0 |
return {1, 11, 1, "dev"}; |
|
14776
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
14778
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
14780
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
|
14803
|
0 |
1 |
assert(t); |
|
14809
|
1 |
0 |
if (!t->nodes.empty()) stack.push_back(0); |
|
14813
|
7 |
1 |
for (size_t i = t->nodes.size(); i > 1; i--) |
|
14818
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
16 |
51 |
return buffer.empty() && stack.size() <= 1; |
|
|
11 |
5 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
14881
|
1116 |
62 |
for (auto&& selector : selectors) { |
|
14886
|
867 |
63 |
if (selector.start.second < int(conf.stack.size())) |
|
14890
|
98 |
88 |
if (selector.start.second < int(conf.buffer.size())) |
|
14896
|
965 |
151 |
if (current >= 0) |
|
14897
|
802 |
410 |
for (auto&& direction : selector.directions) { |
|
14901
|
0 |
0 |
current = node.head ? node.head : -1; |
|
14904
|
120 |
281 |
current = direction.second >= 0 && direction.second < int(node.children.size()) ? |
|
14906
|
127 |
274 |
direction.second < 0 && -direction.second <= int(node.children.size()) ? |
|
14908
|
401 |
401 |
-1; |
|
|
401 |
281 |
-1; |
|
14911
|
247 |
555 |
if (current <= 0) break; |
|
14924
|
1 |
0 |
split(description, '\n', lines); |
|
14925
|
19 |
1 |
for (auto&& line : lines) { |
|
14926
|
18 |
1 |
if (!line.len || line.str[0] == '#') continue; |
|
|
18 |
0 |
if (!line.len || line.str[0] == '#') continue; |
|
14929
|
18 |
0 |
split(line, ',', parts); |
|
14932
|
18 |
0 |
split(parts[0], ' ', words); |
|
14933
|
0 |
18 |
if (words.size() != 2) |
|
14934
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
14937
|
15 |
3 |
if (words[0] == "stack") |
|
14939
|
3 |
0 |
else if (words[0] == "buffer") |
|
14942
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
14945
|
18 |
0 |
if (!parse_int(words[1], "starting index", start_index, error)) return false; |
|
|
18 |
0 |
if (!parse_int(words[1], "starting index", start_index, error)) return false; |
|
14947
|
18 |
0 |
selectors.emplace_back(start, start_index); |
|
14950
|
16 |
18 |
for (size_t i = 1; i < parts.size(); i++) { |
|
14951
|
16 |
0 |
split(parts[i], ' ', words); |
|
14952
|
0 |
16 |
if (words.empty()) |
|
14953
|
0 |
0 |
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
|
14955
|
0 |
16 |
if (words[0] == "parent") { |
|
14956
|
0 |
0 |
if (words.size() != 1) |
|
14957
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
14958
|
0 |
0 |
selectors.back().directions.emplace_back(PARENT, 0); |
|
14959
|
16 |
0 |
} else if (words[0] == "child") { |
|
14960
|
0 |
16 |
if (words.size() != 2) |
|
14961
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
14963
|
16 |
0 |
if (!parse_int(words[1], "child index", child_index, error)) return false; |
|
|
16 |
0 |
if (!parse_int(words[1], "child index", child_index, error)) return false; |
|
14964
|
16 |
0 |
selectors.back().directions.emplace_back(CHILD, child_index); |
|
14966
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
15029
|
0 |
0 |
if (!n.misc.empty()) { |
|
15032
|
0 |
0 |
if (lid != string::npos) { |
|
15037
|
0 |
0 |
if (lid_end == string::npos) lid_end = n.misc.size(); |
|
15067
|
1 |
3 |
if (description == "form") |
|
15069
|
0 |
3 |
else if (description == "lemma") |
|
15071
|
0 |
3 |
else if (description == "lemma_id") |
|
15073
|
0 |
3 |
else if (description == "tag") |
|
15075
|
1 |
2 |
else if (description == "universal_tag") |
|
15077
|
1 |
1 |
else if (description == "feats") |
|
15079
|
0 |
1 |
else if (description == "universal_tag_feats") |
|
15081
|
1 |
0 |
else if (description == "deprel") |
|
15150
|
92 |
36 |
if (it != dictionary.end()) return it->second; |
|
15157
|
18 |
36 |
for (auto&& chr : utf8::decoder(word)) { |
|
15158
|
3 |
15 |
(first ? first_category : other_categories) |= unicode::category(chr); |
|
15162
|
0 |
36 |
if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) { |
|
|
0 |
0 |
if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) { |
|
15166
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) { |
|
15167
|
0 |
0 |
utf8::append(buffer, first ? chr : unicode::lowercase(chr)); |
|
15172
|
0 |
0 |
if (it != dictionary.end()) return it->second; |
|
15175
|
36 |
0 |
if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) { |
|
|
0 |
36 |
if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) { |
|
15179
|
0 |
0 |
if (it != dictionary.end()) return it->second; |
|
15184
|
0 |
36 |
if ((first_category & unicode::N) && !(other_categories & unicode::L)) { |
|
|
0 |
0 |
if ((first_category & unicode::N) && !(other_categories & unicode::L)) { |
|
15189
|
0 |
0 |
if (it != dictionary.end()) return it->second; |
|
15200
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
15205
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
31 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
4 |
27 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
27 |
4 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
27 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
27 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
27 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
15218
|
4 |
0 |
for (unsigned size = data.next_4B(); size; size--) { |
|
|
23 |
4 |
for (unsigned size = data.next_4B(); size; size--) { |
|
15219
|
23 |
0 |
data.next_str(word); |
|
15223
|
4 |
0 |
unknown_index = data.next_1B() ? dictionary.size() : -1; |
|
|
4 |
0 |
unknown_index = data.next_1B() ? dictionary.size() : -1; |
|
15226
|
4 |
0 |
weights.resize(dimension * (dictionary.size() + (unknown_index >= 0))); |
|
15227
|
4 |
0 |
memcpy(weights.data(), data.next(weights.size()), sizeof(float) * weights.size()); |
|
15253
|
0 |
0 |
for (auto&& entry : dictionary) { |
|
15254
|
0 |
0 |
assert(entry.second >= 0 && entry.second < int(dictionary.size())); |
|
|
0 |
0 |
assert(entry.second >= 0 && entry.second < int(dictionary.size())); |
|
15258
|
0 |
0 |
for (auto&& word : words) |
|
15259
|
0 |
0 |
enc.add_str(word); |
|
15261
|
0 |
0 |
enc.add_1B(unknown_index >= 0); |
|
15277
|
0 |
0 |
for (auto&& word : words) { |
|
15278
|
0 |
0 |
assert(word.second.size() == dimension); |
|
15283
|
0 |
0 |
if (unknown_weights.empty()) { |
|
15295
|
0 |
0 |
if (dictionary.empty()) return; |
|
15297
|
0 |
0 |
assert(unknown_index < 0 || unknown_index == int(dictionary.size())); |
|
|
0 |
0 |
assert(unknown_index < 0 || unknown_index == int(dictionary.size())); |
|
15300
|
0 |
0 |
for (auto&& entry : dictionary) { |
|
15304
|
0 |
0 |
if (unknown_index >= 0) |
|
15353
|
0 |
0 |
class neural_network { |
|
|
0 |
0 |
class neural_network { |
|
|
2 |
1 |
class neural_network { |
|
|
1 |
0 |
class neural_network { |
|
|
2 |
1 |
class neural_network { |
|
15397
|
367 |
2 |
for (auto&& row : m) { |
|
15411
|
0 |
62 |
assert(!weights[0].empty()); |
|
15412
|
0 |
62 |
assert(!weights[1].empty()); |
|
15413
|
1116 |
62 |
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
|
|
410 |
706 |
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
|
|
0 |
410 |
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
|
15424
|
1116 |
62 |
for (unsigned sequence = 0; sequence < embedding_ids_sequences.size(); sequence++) |
|
15425
|
4464 |
1116 |
for (unsigned i = 0; i < embeddings.size(); index += embeddings[i].dimension, i++) |
|
15426
|
1640 |
2824 |
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
|
|
0 |
1640 |
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
|
|
1640 |
2824 |
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
|
15428
|
1640 |
0 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
|
1640 |
0 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
|
0 |
1640 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
|
1640 |
0 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
15431
|
8200 |
1640 |
for (unsigned j = 0; j < hidden_layer_size; j++) |
|
15436
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
15437
|
0 |
0 |
for (unsigned k = 0; k < hidden_layer_size; k++) |
|
15441
|
310 |
62 |
for (unsigned i = 0; i < hidden_layer_size; i++) // Bias |
|
15447
|
62 |
0 |
if (!tanh_cache.empty()) |
|
15448
|
310 |
62 |
for (auto&& weight : hidden_layer) |
|
15449
|
310 |
0 |
weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)]; |
|
|
310 |
0 |
weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)]; |
|
15451
|
0 |
0 |
for (auto&& weight : hidden_layer) |
|
15455
|
0 |
0 |
for (auto&& weight : hidden_layer) |
|
15459
|
0 |
0 |
for (auto&& weight : hidden_layer) |
|
15460
|
0 |
0 |
if (weight < 0) weight = 0; |
|
15464
|
310 |
62 |
for (unsigned i = 0; i < hidden_layer_size; i++) |
|
15465
|
4030 |
310 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
15467
|
806 |
62 |
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
|
15471
|
62 |
0 |
if (softmax) { |
|
15473
|
62 |
744 |
for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i]; |
|
|
68 |
676 |
for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i]; |
|
15476
|
806 |
62 |
for (unsigned i = 0; i < outcomes_size; i++) sum += (outcomes[i] = exp(outcomes[i] - max)); |
|
15479
|
806 |
62 |
for (unsigned i = 0; i < outcomes_size; i++) outcomes[i] *= sum; |
|
15485
|
655360 |
1 |
for (unsigned i = 0; i < tanh_cache.size(); i++) |
|
15491
|
4 |
1 |
for (auto&& embedding : embeddings) embeddings_dim += embedding.dimension; |
|
15494
|
0 |
1 |
assert(sequences * embeddings_dim + 1 == weights[0].size()); |
|
15499
|
4 |
1 |
for (unsigned i = 0, weight_index = 0; i < embeddings.size(); weight_index += embeddings[i].dimension, i++) { |
|
15501
|
31 |
0 |
while (words < max_words && embeddings[i].weight(words)) words++; |
|
|
4 |
27 |
while (words < max_words && embeddings[i].weight(words)) words++; |
|
|
27 |
4 |
while (words < max_words && embeddings[i].weight(words)) words++; |
|
15504
|
27 |
4 |
for (unsigned word = 0; word < words; word++) { |
|
15508
|
486 |
27 |
for (unsigned sequence = 0, index = weight_index; sequence < sequences; index += embeddings_dim, sequence++) |
|
15509
|
2430 |
486 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
15510
|
12150 |
2430 |
for (unsigned k = 0; k < hidden_layer_size; k++) |
|
15587
|
0 |
0 |
struct workspace { |
|
|
0 |
0 |
struct workspace { |
|
|
0 |
0 |
struct workspace { |
|
|
0 |
0 |
struct workspace { |
|
|
0 |
0 |
struct workspace { |
|
|
0 |
0 |
struct workspace { |
|
15677
|
0 |
0 |
if (parameters.hidden_layer) { |
|
15679
|
0 |
0 |
-parameters.initialization_range * sqrt(6.0 / float(input_size + parameters.hidden_layer)); |
|
15683
|
0 |
0 |
for (auto&& row : network.weights[0]) { |
|
15685
|
0 |
0 |
for (auto&& weight : row) |
|
15690
|
0 |
0 |
-parameters.initialization_range * sqrt(6.0 / float(output_size + parameters.hidden_layer)); |
|
15694
|
0 |
0 |
for (auto&& row : network.weights[1]) { |
|
15696
|
0 |
0 |
for (auto&& weight : row) |
|
15713
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
15717
|
0 |
0 |
if (iteration++ >= iterations) return false; |
|
15719
|
0 |
0 |
if (trainer.algorithm != network_trainer::ADADELTA) |
|
15720
|
0 |
0 |
if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1) |
|
|
0 |
0 |
if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1) |
|
15729
|
0 |
0 |
if (dropout_input) { |
|
15732
|
0 |
0 |
for (auto&& flag : w.input_dropout) |
|
15736
|
0 |
0 |
if (dropout_hidden) { |
|
15739
|
0 |
0 |
for (auto&& flag : w.hidden_dropout) |
|
15743
|
0 |
0 |
for (unsigned i = 0; i < network.weights[0].front().size(); i++) |
|
15744
|
0 |
0 |
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
|
|
0 |
0 |
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
|
|
0 |
0 |
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
|
15757
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
15760
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
15761
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
15763
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, embedding++, index++) |
|
15764
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) |
|
15765
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
15771
|
0 |
0 |
if (dropout_input) { // Dropout normalization |
|
15773
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15776
|
0 |
0 |
for (auto&& i : w.hidden_kept) // Bias |
|
15782
|
0 |
0 |
for (auto&& weight : w.hidden_layer) |
|
15786
|
0 |
0 |
for (auto&& weight : w.hidden_layer) |
|
15790
|
0 |
0 |
for (auto&& weight : w.hidden_layer) |
|
15791
|
0 |
0 |
if (weight < 0) weight = 0; |
|
15794
|
0 |
0 |
if (dropout_hidden) { // Dropout normalization |
|
15796
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15800
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15801
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
15803
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
|
15808
|
0 |
0 |
for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i]; |
|
|
0 |
0 |
for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i]; |
|
15811
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) sum += (w.outcomes[i] = exp(w.outcomes[i] - max)); |
|
15814
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) w.outcomes[i] *= sum; |
|
15861
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
15862
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
15863
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
15864
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
15868
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
15869
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
15870
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
15875
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
15876
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
15880
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15881
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
15884
|
0 |
0 |
if (dropout_hidden) { |
|
|
0 |
0 |
if (dropout_hidden) { |
|
|
0 |
0 |
if (dropout_hidden) { |
|
|
0 |
0 |
if (dropout_hidden) { |
|
|
0 |
0 |
if (dropout_hidden) { |
|
15886
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15893
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15897
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
15903
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15904
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
15910
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
15911
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
15912
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
15916
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
15917
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
15921
|
0 |
0 |
if (dropout_input) { |
|
|
0 |
0 |
if (dropout_input) { |
|
|
0 |
0 |
if (dropout_input) { |
|
|
0 |
0 |
if (dropout_input) { |
|
|
0 |
0 |
if (dropout_input) { |
|
15923
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15928
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
15931
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
15932
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
15936
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
15937
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
15938
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
15940
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
15946
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
15947
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
15948
|
0 |
0 |
if (error_embedding) |
|
|
0 |
0 |
if (error_embedding) |
|
|
0 |
0 |
if (error_embedding) |
|
|
0 |
0 |
if (error_embedding) |
|
|
0 |
0 |
if (error_embedding) |
|
15949
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
15951
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
15952
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
15962
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
15963
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
15968
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
15972
|
0 |
0 |
if (!network.weights[0].empty()) |
|
|
0 |
0 |
if (!network.weights[0].empty()) |
|
|
0 |
0 |
if (!network.weights[0].empty()) |
|
|
0 |
0 |
if (!network.weights[0].empty()) |
|
|
0 |
0 |
if (!network.weights[0].empty()) |
|
15973
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
15974
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
15975
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
15976
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
15977
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
15983
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
15984
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
15985
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
15986
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
15987
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
15988
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
15991
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
15992
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
15999
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
16026
|
0 |
0 |
training_failure("Internal error, unsupported trainer!"); |
|
|
0 |
0 |
training_failure("Internal error, unsupported trainer!"); |
|
16030
|
0 |
0 |
if (!l1_regularization) return; |
|
16032
|
0 |
0 |
for (auto&& weights : network.weights) |
|
16033
|
0 |
0 |
for (unsigned i = 0; i + 1 /*ignore biases*/ < weights.size(); i++) { |
|
16035
|
0 |
0 |
for (auto&& weight : row) |
|
16036
|
0 |
0 |
if (weight < l1_regularization) weight += l1_regularization; |
|
16037
|
0 |
0 |
else if (weight > l1_regularization) weight -= l1_regularization; |
|
16043
|
0 |
0 |
if (!maxnorm_regularization) return; |
|
16045
|
0 |
0 |
for (unsigned i = 0; i < 2; i++) |
|
16046
|
0 |
0 |
for (unsigned j = 0; j < network.weights[i].front().size(); j++) { |
|
16048
|
0 |
0 |
for (auto&& row : network.weights[i]) |
|
16051
|
0 |
0 |
if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) { |
|
|
0 |
0 |
if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) { |
|
16053
|
0 |
0 |
for (auto&& row : network.weights[i]) |
|
16060
|
0 |
0 |
if (l1_regularization) l1_regularize(); |
|
|
0 |
0 |
if (l1_regularization) l1_regularize(); |
|
|
0 |
0 |
if (l1_regularization) l1_regularize(); |
|
16065
|
0 |
0 |
enc.add_4B(m.empty() ? 0 : m.front().size()); |
|
16067
|
0 |
0 |
for (auto&& row : m) { |
|
16068
|
0 |
0 |
assert(row.size() == m.front().size()); |
|
16283
|
1 |
0 |
struct workspace { |
|
|
2 |
1 |
struct workspace { |
|
16284
|
2 |
1 |
workspace(bool single_root) : conf(single_root) {} |
|
16342
|
0 |
0 |
ifstream in(path_from_utf8(file).c_str(), ifstream::in | ifstream::binary); |
|
16343
|
0 |
0 |
if (!in.is_open()) return nullptr; |
|
16344
|
0 |
0 |
return load(in, cache); |
|
16351
|
1 |
0 |
if (!compressor::load(in, data)) return nullptr; |
|
|
1 |
0 |
if (!compressor::load(in, data)) return nullptr; |
|
16355
|
1 |
0 |
data.next_str(name); |
|
16357
|
1 |
0 |
result.reset(create(name)); |
|
16358
|
0 |
1 |
if (!result) return nullptr; |
|
16360
|
1 |
0 |
result->load(data, cache); |
|
|
0 |
0 |
result->load(data, cache); |
|
16365
|
1 |
0 |
return result && data.is_end() ? result.release() : nullptr; |
|
|
1 |
0 |
return result && data.is_end() ? result.release() : nullptr; |
|
16369
|
1 |
0 |
if (name == "nn") return new parser_nn(false); |
|
16370
|
0 |
0 |
if (name == "nn_versioned") return new parser_nn(true); |
|
16398
|
1 |
0 |
if (beam_size > 1) |
|
16401
|
0 |
0 |
parse_greedy(t, cost); |
|
16405
|
0 |
0 |
assert(system); |
|
16406
|
0 |
0 |
if (cost) *cost = 0.; |
|
16410
|
0 |
0 |
if (!w) w = new workspace(single_root); |
|
16416
|
0 |
0 |
if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size()); |
|
16417
|
0 |
0 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
|
16418
|
0 |
0 |
if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size()); |
|
16419
|
0 |
0 |
for (size_t j = 0; j < embeddings.size(); j++) { |
|
16427
|
0 |
0 |
for (; !w->conf.final(); transitions++) { |
|
16431
|
0 |
0 |
for (size_t i = 0; i < w->extracted_nodes.size(); i++) |
|
16432
|
0 |
0 |
w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr; |
|
16439
|
0 |
0 |
for (unsigned i = 0; i < w->outcomes.size(); i++) |
|
16440
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
16445
|
0 |
0 |
if (cost) *cost += log(w->outcomes[best]); |
|
16448
|
0 |
0 |
if (child >= 0) |
|
16449
|
0 |
0 |
for (size_t i = 0; i < embeddings.size(); i++) { |
|
16455
|
0 |
0 |
if (cost && transitions) |
|
16463
|
0 |
1 |
assert(system); |
|
16467
|
1 |
0 |
if (!w) w = new workspace(single_root); |
|
16470
|
2 |
1 |
for (int i = 0; i < 2; i++) { |
|
16471
|
2 |
10 |
while (w->bs_confs[i].size() < beam_size) w->bs_confs[i].emplace_back(single_root); |
|
16472
|
0 |
2 |
while (w->bs_confs[i].size() > beam_size) w->bs_confs[i].pop_back(); |
|
16481
|
1 |
0 |
if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size()); |
|
16482
|
1 |
0 |
if (w->embeddings_values.size() < t.nodes.size()) w->embeddings_values.resize(t.nodes.size()); |
|
16483
|
8 |
1 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
|
16484
|
8 |
0 |
if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size()); |
|
16485
|
8 |
0 |
if (w->embeddings_values[i].size() < embeddings.size()) w->embeddings_values[i].resize(embeddings.size()); |
|
16486
|
32 |
8 |
for (size_t j = 0; j < embeddings.size(); j++) { |
|
16494
|
1 |
15 |
for (bool all_final = false; !all_final; iteration++) { |
|
16498
|
67 |
15 |
for (size_t c = 0; c < w->bs_confs_size[iteration & 1]; c++) { |
|
16501
|
5 |
62 |
if (bs_conf.conf.final()) { |
|
16502
|
0 |
5 |
if (w->bs_alternatives.size() == beam_size) { |
|
16503
|
0 |
0 |
if (bs_conf.cost <= w->bs_alternatives[0].cost) continue; |
|
16515
|
496 |
62 |
for (size_t i = 0; i < t.nodes.size(); i++) |
|
16516
|
1984 |
496 |
for (size_t j = 0; j < embeddings.size(); j++) { |
|
16518
|
96 |
1888 |
if (w->word != w->embeddings_values[i][j]) { |
|
16527
|
1116 |
62 |
for (size_t i = 0; i < w->extracted_nodes.size(); i++) |
|
16528
|
410 |
706 |
w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr; |
|
16534
|
806 |
62 |
for (unsigned i = 0; i < w->outcomes.size(); i++) |
|
16535
|
633 |
173 |
if (system->applicable(bs_conf.conf, i)) { |
|
16537
|
567 |
66 |
if (w->bs_alternatives.size() == beam_size) { |
|
16538
|
170 |
397 |
if (cost <= w->bs_alternatives[0].cost) continue; |
|
16548
|
15 |
71 |
for (auto&& alternative : w->bs_alternatives) { |
|
16552
|
66 |
5 |
if (alternative.transition >= 0) { |
|
16562
|
4 |
1 |
for (size_t i = 1; i < w->bs_confs_size[iteration & 1]; i++) |
|
16563
|
2 |
2 |
if (w->bs_confs[iteration & 1][i].cost > w->bs_confs[iteration & 1][best].cost) |
|
16567
|
0 |
1 |
if (cost) *cost = w->bs_confs[iteration & 1][best].cost * (t.nodes.size() - 1); |
|
16574
|
129 |
1032 |
for (auto&& node : conf.t->nodes) node.children.clear(); |
|
16575
|
1032 |
129 |
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
|
16578
|
302 |
730 |
if (heads[i] >= 0) conf.t->nodes[heads[i]].children.push_back(i); |
|
16583
|
1 |
66 |
if (conf.t->nodes.size() > heads.size()) heads.resize(conf.t->nodes.size()); |
|
16584
|
1 |
66 |
if (conf.t->nodes.size() > deprels.size()) deprels.resize(conf.t->nodes.size()); |
|
16585
|
536 |
67 |
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
|
16594
|
0 |
1 |
version = versioned ? data.next_1B() : 1; |
|
|
0 |
0 |
version = versioned ? data.next_1B() : 1; |
|
16595
|
0 |
1 |
if (!(version >= 1 && version <= VERSION_LATEST)) |
|
16598
|
0 |
1 |
single_root = version >= 2 ? data.next_1B() : false; |
|
|
0 |
0 |
single_root = version >= 2 ? data.next_1B() : false; |
|
|
0 |
0 |
single_root = version >= 2 ? data.next_1B() : false; |
|
16601
|
1 |
0 |
labels.resize(data.next_2B()); |
|
|
1 |
0 |
labels.resize(data.next_2B()); |
|
16602
|
6 |
1 |
for (auto&& label : labels) |
|
16603
|
6 |
0 |
data.next_str(label); |
|
16607
|
1 |
0 |
data.next_str(system_name); |
|
16608
|
1 |
0 |
system.reset(transition_system::create(system_name, labels)); |
|
16609
|
0 |
1 |
if (!system) throw binary_decoder_error("Cannot load transition system"); |
|
16612
|
1 |
0 |
data.next_str(description); |
|
16613
|
1 |
0 |
if (!nodes.create(description, error)) |
|
|
0 |
1 |
if (!nodes.create(description, error)) |
|
16617
|
1 |
0 |
values.resize(data.next_2B()); |
|
|
1 |
0 |
values.resize(data.next_2B()); |
|
16618
|
4 |
1 |
for (auto&& value : values) { |
|
16619
|
4 |
0 |
data.next_str(description); |
|
16620
|
4 |
0 |
if (!value.create(description, error)) |
|
|
0 |
4 |
if (!value.create(description, error)) |
|
16624
|
1 |
0 |
embeddings.resize(values.size()); |
|
16625
|
4 |
1 |
for (auto&& embedding : embeddings) |
|
16626
|
4 |
0 |
embedding.load(data); |
|
16629
|
1 |
0 |
network.load(data); |
|
16630
|
1 |
0 |
network.generate_tanh_cache(); |
|
16631
|
1 |
0 |
network.generate_embeddings_cache(embeddings, embeddings_cache, cache); |
|
16678
|
0 |
0 |
if (train.empty()) training_failure("No training data was given!"); |
|
|
0 |
0 |
if (train.empty()) training_failure("No training data was given!"); |
|
|
0 |
0 |
if (train.empty()) training_failure("No training data was given!"); |
|
16684
|
0 |
0 |
for (auto&& tree : train) |
|
16685
|
0 |
0 |
for (auto&& node : tree.nodes) |
|
16686
|
0 |
0 |
if (node.id) { |
|
16687
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
16688
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
16696
|
0 |
0 |
for (auto&& tree : train) |
|
16697
|
0 |
0 |
for (auto&& node : tree.nodes) |
|
16698
|
0 |
0 |
if (node.id && !labels_set.count(node.deprel)) { |
|
16700
|
0 |
0 |
parser.labels.push_back(node.deprel); |
|
16704
|
0 |
0 |
if (single_root) { |
|
16705
|
0 |
0 |
for (auto&& tree : train) { |
|
16707
|
0 |
0 |
for (auto&& node : tree.nodes) |
|
16708
|
0 |
0 |
if (node.id) { |
|
16709
|
0 |
0 |
if (node.head == 0 && node.deprel != "root") |
|
|
0 |
0 |
if (node.head == 0 && node.deprel != "root") |
|
|
0 |
0 |
if (node.head == 0 && node.deprel != "root") |
|
16710
|
0 |
0 |
training_failure("When single root is required, every root node must have 'root' deprel!"); |
|
|
0 |
0 |
training_failure("When single root is required, every root node must have 'root' deprel!"); |
|
16711
|
0 |
0 |
if (node.head != 0 && node.deprel == "root") |
|
|
0 |
0 |
if (node.head != 0 && node.deprel == "root") |
|
|
0 |
0 |
if (node.head != 0 && node.deprel == "root") |
|
16712
|
0 |
0 |
training_failure("When single root is required, any non-root cannot have 'root' deprel!"); |
|
|
0 |
0 |
training_failure("When single root is required, any non-root cannot have 'root' deprel!"); |
|
16715
|
0 |
0 |
if (roots != 1) |
|
16716
|
0 |
0 |
training_failure("When single root is required, every training tree must have single root!"); |
|
|
0 |
0 |
training_failure("When single root is required, every training tree must have single root!"); |
|
16720
|
0 |
0 |
if (!labels_set.count("root")) |
|
|
0 |
0 |
if (!labels_set.count("root")) |
|
16721
|
0 |
0 |
training_failure("When single root is required, the deprel 'root' must be present!"); |
|
|
0 |
0 |
training_failure("When single root is required, the deprel 'root' must be present!"); |
|
16722
|
0 |
0 |
if (labels_set.size() <= 1) |
|
16723
|
0 |
0 |
training_failure("When single root is required, deprel different from 'root' must exist!"); |
|
|
0 |
0 |
training_failure("When single root is required, deprel different from 'root' must exist!"); |
|
16727
|
0 |
0 |
parser.system.reset(transition_system::create(transition_system_name, parser.labels)); |
|
16728
|
0 |
0 |
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
|
|
0 |
0 |
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
|
|
0 |
0 |
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
|
16730
|
0 |
0 |
unique_ptr oracle(parser.system->oracle(transition_oracle_name)); |
|
16731
|
0 |
0 |
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
|
|
0 |
0 |
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
|
|
0 |
0 |
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
|
16735
|
0 |
0 |
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
|
|
0 |
0 |
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
|
|
0 |
0 |
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
|
16740
|
0 |
0 |
split(embeddings_description, '\n', lines); |
|
16741
|
0 |
0 |
for (auto&& line : lines) { |
|
16743
|
0 |
0 |
if (!line.len || line.str[0] == '#') continue; |
|
|
0 |
0 |
if (!line.len || line.str[0] == '#') continue; |
|
16745
|
0 |
0 |
split(line, ' ', tokens); |
|
16746
|
0 |
0 |
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
|
|
0 |
0 |
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
|
|
0 |
0 |
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
|
16747
|
0 |
0 |
training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!"); |
|
|
0 |
0 |
training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!"); |
|
16749
|
0 |
0 |
value_names.emplace_back(string(tokens[0].str, tokens[0].len)); |
|
16750
|
0 |
0 |
parser.values.emplace_back(); |
|
16751
|
0 |
0 |
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
|
|
0 |
0 |
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
|
|
0 |
0 |
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
|
16753
|
0 |
0 |
int dimension = parse_int(tokens[1], "embedding dimension"); |
|
16754
|
0 |
0 |
int min_count = parse_int(tokens[2], "minimum frequency count"); |
|
16764
|
0 |
0 |
for (auto&& tree : train) |
|
16765
|
0 |
0 |
for (auto&& node : tree.nodes) |
|
16766
|
0 |
0 |
if (node.id) { |
|
16767
|
0 |
0 |
parser.values.back().extract(node, word); |
|
16772
|
0 |
0 |
if (tokens.size() >= 4) { |
|
16773
|
0 |
0 |
int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1; |
|
|
0 |
0 |
int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1; |
|
16774
|
0 |
0 |
int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max(); |
|
|
0 |
0 |
int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max(); |
|
16775
|
0 |
0 |
ifstream in(path_from_utf8(string(tokens[3].str, tokens[3].len)).c_str()); |
|
16776
|
0 |
0 |
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
|
|
0 |
0 |
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
|
|
0 |
0 |
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
|
16781
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
16782
|
0 |
0 |
split(line, ' ', parts); |
|
16783
|
0 |
0 |
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
|
|
0 |
0 |
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
|
|
0 |
0 |
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
|
16784
|
0 |
0 |
int file_dimension = parse_int(parts[1], "embedding file dimension"); |
|
16786
|
0 |
0 |
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
|
|
0 |
0 |
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
|
|
0 |
0 |
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
|
16790
|
0 |
0 |
if (file_dimension > dimension) { |
|
16791
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
16794
|
0 |
0 |
projection.resize(dimension); |
|
16795
|
0 |
0 |
for (auto&& row : projection) { |
|
16796
|
0 |
0 |
row.resize(file_dimension); |
|
16797
|
0 |
0 |
for (auto&& weight : row) weight = uniform(generator); |
|
16800
|
0 |
0 |
for (auto&& weight : row) sum += weight; |
|
16801
|
0 |
0 |
for (auto&& weight : row) weight /= sum; |
|
16806
|
0 |
0 |
vector input_weights(file_dimension); |
|
16807
|
0 |
0 |
vector projected_weights(dimension); |
|
16808
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
16809
|
0 |
0 |
split(line, ' ', parts); |
|
16810
|
0 |
0 |
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
|
|
0 |
0 |
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
|
|
0 |
0 |
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
|
16811
|
0 |
0 |
if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]); |
|
|
0 |
0 |
if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]); |
|
16812
|
0 |
0 |
for (int i = 0; i < file_dimension; i++) |
|
16813
|
0 |
0 |
input_weights[i] = parse_double(parts[1 + i], "embedding weight"); |
|
16818
|
0 |
0 |
if (update_weights == 2 && !word_counts.count(word)) |
|
16821
|
0 |
0 |
for (int i = 0; i < dimension; i++) |
|
16822
|
0 |
0 |
if (file_dimension == dimension) { |
|
16826
|
0 |
0 |
for (int j = 0; j < file_dimension; j++) |
|
16830
|
0 |
0 |
if (!weights_set.count(word)) { |
|
16831
|
0 |
0 |
weights.emplace_back(word, projected_weights); |
|
16836
|
0 |
0 |
updatable_index = update_weights ? 0 : embeddings_from_file; |
|
16842
|
0 |
0 |
for (auto&& word_count : word_counts) |
|
16843
|
0 |
0 |
if (word_count.second >= min_count && !weights_set.count(word_count.first)) |
|
16844
|
0 |
0 |
count_words.emplace_back(word_count.second, word_count.first); |
|
16848
|
0 |
0 |
vector word_weights(dimension); |
|
16850
|
0 |
0 |
for (auto&& count_word : count_words) { |
|
16851
|
0 |
0 |
for (auto&& word_weight : word_weights) |
|
16854
|
0 |
0 |
weights.emplace_back(count_word.second, word_weights); |
|
16859
|
0 |
0 |
vector unknown_weights(dimension); |
|
16860
|
0 |
0 |
if (min_count > 1) { |
|
16863
|
0 |
0 |
for (auto&& weight : unknown_weights) |
|
16868
|
0 |
0 |
parser.embeddings.emplace_back(); |
|
16869
|
0 |
0 |
parser.embeddings.back().create(dimension, updatable_index, weights, unknown_weights); |
|
16874
|
0 |
0 |
for (auto&& tree : train) |
|
16875
|
0 |
0 |
for (auto&& node : tree.nodes) |
|
16876
|
0 |
0 |
if (node.id) { |
|
16877
|
0 |
0 |
parser.values.back().extract(node, word); |
|
16879
|
0 |
0 |
int word_id = parser.embeddings.back().lookup_word(word, buffer); |
|
16881
|
0 |
0 |
words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file; |
|
|
0 |
0 |
words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file; |
|
16891
|
0 |
0 |
for (auto&& embedding : parser.embeddings) total_dimension += embedding.dimension; |
|
16892
|
0 |
0 |
for (auto&& tree : train) total_nodes += tree.nodes.size() - 1; |
|
16896
|
0 |
0 |
neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator); |
|
|
0 |
0 |
neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator); |
|
16902
|
0 |
0 |
for (size_t i = 0; i < train.size(); i++) |
|
16905
|
0 |
0 |
for (int iteration = 1; network_trainer.next_iteration(); iteration++) { |
|
16922
|
0 |
0 |
tree t_eval; |
|
16930
|
0 |
0 |
for (unsigned current_index; (current_index = atomic_index++) < permutation.size();) { |
|
16934
|
0 |
0 |
conf.init(&t); |
|
16937
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
16938
|
0 |
0 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
|
16939
|
0 |
0 |
nodes_embeddings[i].resize(parser.embeddings.size()); |
|
16940
|
0 |
0 |
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
|
16941
|
0 |
0 |
parser.values[j].extract(t.nodes[i], word); |
|
16942
|
0 |
0 |
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
|
16947
|
0 |
0 |
auto tree_oracle = oracle->create_tree_oracle(gold); |
|
16950
|
0 |
0 |
while (!conf.final()) { |
|
16952
|
0 |
0 |
parser.nodes.extract(conf, extracted_nodes); |
|
16953
|
0 |
0 |
extracted_embeddings.resize(extracted_nodes.size()); |
|
16954
|
0 |
0 |
for (size_t i = 0; i < extracted_nodes.size(); i++) |
|
16955
|
0 |
0 |
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
|
16958
|
0 |
0 |
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
|
16962
|
0 |
0 |
for (unsigned i = 0; i < workspace.outcomes.size(); i++) |
|
16963
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
16967
|
0 |
0 |
auto prediction = tree_oracle->predict(conf, network_best, iteration); |
|
16970
|
0 |
0 |
if (parser.system->applicable(conf, prediction.best)) { |
|
|
0 |
0 |
if (parser.system->applicable(conf, prediction.best)) { |
|
16972
|
0 |
0 |
if (workspace.outcomes[prediction.best]) |
|
16976
|
0 |
0 |
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, prediction.best, workspace); |
|
16980
|
0 |
0 |
if (!parser.system->applicable(conf, prediction.to_follow)) |
|
|
0 |
0 |
if (!parser.system->applicable(conf, prediction.to_follow)) |
|
16984
|
0 |
0 |
int child = parser.system->perform(conf, prediction.to_follow); |
|
16987
|
0 |
0 |
if (child >= 0) |
|
16988
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
16989
|
0 |
0 |
parser.values[i].extract(t.nodes[child], word); |
|
16990
|
0 |
0 |
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
16996
|
0 |
0 |
if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) { |
|
|
0 |
0 |
if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) { |
|
17001
|
0 |
0 |
conf.init(&t); |
|
17004
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
17005
|
0 |
0 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
|
17006
|
0 |
0 |
nodes_embeddings[i].resize(parser.embeddings.size()); |
|
17007
|
0 |
0 |
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
|
17008
|
0 |
0 |
parser.values[j].extract(t.nodes[i], word); |
|
17009
|
0 |
0 |
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
|
17014
|
0 |
0 |
auto tree_oracle = oracle->create_tree_oracle(gold); |
|
17017
|
0 |
0 |
while (!conf.final()) { |
|
17019
|
0 |
0 |
parser.nodes.extract(conf, extracted_nodes); |
|
17020
|
0 |
0 |
extracted_embeddings.resize(extracted_nodes.size()); |
|
17021
|
0 |
0 |
for (size_t i = 0; i < extracted_nodes.size(); i++) |
|
17022
|
0 |
0 |
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
|
17027
|
0 |
0 |
tree_oracle->interesting_transitions(conf, transitions_eval); |
|
17028
|
0 |
0 |
for (auto&& transition : transitions_eval) { |
|
17030
|
0 |
0 |
conf_eval = conf; |
|
17032
|
0 |
0 |
nodes_embeddings_eval = nodes_embeddings; |
|
17035
|
0 |
0 |
int child = parser.system->perform(conf_eval, transition); |
|
17036
|
0 |
0 |
if (child >= 0) |
|
17037
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
17038
|
0 |
0 |
parser.values[i].extract(t_eval.nodes[child], word); |
|
17039
|
0 |
0 |
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
17043
|
0 |
0 |
while (!conf_eval.final()) { |
|
17045
|
0 |
0 |
parser.nodes.extract(conf_eval, extracted_nodes_eval); |
|
17046
|
0 |
0 |
extracted_embeddings_eval.resize(extracted_nodes_eval.size()); |
|
17047
|
0 |
0 |
for (size_t i = 0; i < extracted_nodes_eval.size(); i++) |
|
17048
|
0 |
0 |
extracted_embeddings_eval[i] = extracted_nodes_eval[i] >= 0 ? &nodes_embeddings_eval[extracted_nodes_eval[i]] : nullptr; |
|
17051
|
0 |
0 |
parser.network.propagate(parser.embeddings, extracted_embeddings_eval, hidden_layer_eval, outcomes_eval, nullptr, false); |
|
17055
|
0 |
0 |
for (unsigned i = 0; i < outcomes_eval.size(); i++) |
|
17056
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
17060
|
0 |
0 |
int child = parser.system->perform(conf_eval, network_best); |
|
17063
|
0 |
0 |
if (child >= 0) |
|
17064
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
17065
|
0 |
0 |
parser.values[i].extract(t_eval.nodes[child], word); |
|
17066
|
0 |
0 |
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
17071
|
0 |
0 |
for (unsigned i = 1; i < gold.nodes.size(); i++) |
|
17074
|
0 |
0 |
if (uas > best_uas) best = transition, best_uas = uas; |
|
17078
|
0 |
0 |
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
|
17081
|
0 |
0 |
if (workspace.outcomes[best]) |
|
17083
|
0 |
0 |
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, best, workspace); |
|
17092
|
0 |
0 |
int child = parser.system->perform(conf, /*network_*/best); |
|
17095
|
0 |
0 |
if (child >= 0) |
|
17096
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
|
17097
|
0 |
0 |
parser.values[i].extract(t.nodes[child], word); |
|
17098
|
0 |
0 |
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
|
17104
|
0 |
0 |
for (double old_atomic_logprob = atomic_logprob; atomic_logprob.compare_exchange_weak(old_atomic_logprob, old_atomic_logprob + logprob); ) {} |
|
17107
|
0 |
0 |
cerr << "Iteration " << iteration << ": "; |
|
|
0 |
0 |
cerr << "Iteration " << iteration << ": "; |
|
17108
|
0 |
0 |
training(); |
|
17112
|
0 |
0 |
if (!heldout.empty()) { |
|
17113
|
0 |
0 |
tree t; |
|
17115
|
0 |
0 |
for (auto&& gold : heldout) { |
|
17119
|
0 |
0 |
for (size_t i = 1; i < t.nodes.size(); i++) { |
|
17122
|
0 |
0 |
correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel; |
|
|
0 |
0 |
correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel; |
|
17126
|
0 |
0 |
cerr << ", heldout UAS " << fixed << setprecision(2) << (100. * correct_unlabelled / total) << "%, LAS " << (100. * correct_labelled / total) << "%"; |
|
17128
|
0 |
0 |
if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) { |
|
|
0 |
0 |
if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) { |
|
17129
|
0 |
0 |
heldout_best_network = parser.network; |
|
17138
|
0 |
0 |
if (parameters.early_stopping && heldout_best_iteration > 0) { |
|
|
0 |
0 |
if (parameters.early_stopping && heldout_best_iteration > 0) { |
|
17140
|
0 |
0 |
parser.network = heldout_best_network; |
|
17144
|
0 |
0 |
enc.add_1B(parser.version); |
|
17147
|
0 |
0 |
enc.add_1B(single_root); |
|
17150
|
0 |
0 |
enc.add_2B(parser.labels.size()); |
|
17151
|
0 |
0 |
for (auto&& label : parser.labels) |
|
17152
|
0 |
0 |
enc.add_str(label); |
|
17153
|
0 |
0 |
enc.add_str(transition_system_name); |
|
17156
|
0 |
0 |
enc.add_str(nodes_description); |
|
17159
|
0 |
0 |
enc.add_2B(value_names.size()); |
|
17160
|
0 |
0 |
for (auto&& value_name : value_names) |
|
17161
|
0 |
0 |
enc.add_str(value_name); |
|
17162
|
0 |
0 |
for (auto&& embedding : parser.embeddings) |
|
17163
|
0 |
0 |
embedding.save(enc); |
|
17166
|
0 |
0 |
network_trainer.save_network(enc); |
|
17188
|
0 |
387 |
if (conf.single_root && label_is_root) |
|
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
17191
|
351 |
36 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2]; |
|
|
90 |
261 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2]; |
|
17195
|
0 |
15 |
assert(applicable(conf)); |
|
17206
|
0 |
395 |
if (conf.single_root && label_is_root) |
|
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
17207
|
0 |
0 |
return conf.stack.size() == 2 && conf.buffer.empty(); |
|
|
0 |
0 |
return conf.stack.size() == 2 && conf.buffer.empty(); |
|
17208
|
0 |
395 |
else if (conf.single_root) // && !label_is_root |
|
17215
|
0 |
23 |
assert(applicable(conf)); |
|
17229
|
0 |
28 |
assert(applicable(conf)); |
|
17238
|
0 |
0 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
|
|
0 |
0 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
|
|
0 |
0 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
|
17242
|
0 |
0 |
assert(applicable(conf)); |
|
17253
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
17256
|
0 |
0 |
return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3]; |
|
|
0 |
0 |
return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3]; |
|
17260
|
0 |
0 |
assert(applicable(conf)); |
|
17273
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
17275
|
0 |
0 |
else if (conf.single_root) // && !label_is_root |
|
17282
|
0 |
0 |
assert(applicable(conf)); |
|
17386
|
0 |
806 |
assert(transition < transitions.size()); |
|
17392
|
0 |
66 |
assert(transition < transitions.size()); |
|
17398
|
1 |
0 |
if (name == "projective") return new transition_system_projective(labels); |
|
|
1 |
0 |
if (name == "projective") return new transition_system_projective(labels); |
|
17399
|
0 |
0 |
if (name == "swap") return new transition_system_swap(labels); |
|
|
0 |
0 |
if (name == "swap") return new transition_system_swap(labels); |
|
17400
|
0 |
0 |
if (name == "link2") return new transition_system_link2(labels); |
|
|
0 |
0 |
if (name == "link2") return new transition_system_link2(labels); |
|
17422
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
|
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
|
17423
|
0 |
0 |
for (auto&& label : labels) { |
|
17424
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
17425
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
17426
|
0 |
0 |
transitions.emplace_back(new transition_left_arc_2(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_left_arc_2(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_left_arc_2(label)); |
|
17427
|
0 |
0 |
transitions.emplace_back(new transition_right_arc_2(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_right_arc_2(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_right_arc_2(label)); |
|
17435
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
17463
|
0 |
0 |
if (!conf.buffer.empty()) transitions.push_back(0); |
|
17468
|
0 |
0 |
for (int direction = 0; direction < 4; direction++) |
|
17469
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
17474
|
0 |
0 |
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
|
|
0 |
0 |
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
|
|
0 |
0 |
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
|
17476
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17477
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17478
|
0 |
0 |
if (!conf.single_root || |
|
|
0 |
0 |
if (!conf.single_root || |
|
17479
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
17480
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
17481
|
0 |
0 |
(i != root_label && conf.stack.size() > 3 && direction >= 2)) |
|
|
0 |
0 |
(i != root_label && conf.stack.size() > 3 && direction >= 2)) |
|
17490
|
0 |
0 |
for (int direction = 0; direction < 4; direction++) |
|
17491
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
17495
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
17496
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17497
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17510
|
0 |
0 |
if (name == "static") return new transition_system_link2_oracle_static(labels); |
|
|
0 |
0 |
if (name == "static") return new transition_system_link2_oracle_static(labels); |
|
17532
|
1 |
0 |
transitions.emplace_back(new transition_shift()); |
|
|
1 |
0 |
transitions.emplace_back(new transition_shift()); |
|
17533
|
6 |
1 |
for (auto&& label : labels) { |
|
17534
|
6 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
|
6 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
|
6 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
17535
|
6 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
|
6 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
|
6 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
17543
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
17569
|
0 |
0 |
if (!conf.buffer.empty()) transitions.push_back(0); |
|
17570
|
0 |
0 |
if (conf.stack.size() >= 2) |
|
17571
|
0 |
0 |
for (int direction = 0; direction < 2; direction++) { |
|
17573
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17574
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17575
|
0 |
0 |
if (!conf.single_root || |
|
|
0 |
0 |
if (!conf.single_root || |
|
17576
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
17577
|
0 |
0 |
(i != root_label && conf.stack.size() > 2)) |
|
17584
|
0 |
0 |
if (conf.stack.size() >= 2) { |
|
17587
|
0 |
0 |
if (gold.nodes[child].head == parent) { |
|
17588
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17589
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17597
|
0 |
0 |
if (conf.stack.size() >= 2) { |
|
17600
|
0 |
0 |
if (gold.nodes[child].head == parent && |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && |
|
17601
|
0 |
0 |
(conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) { |
|
|
0 |
0 |
(conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) { |
|
17602
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17603
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17618
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
17648
|
0 |
0 |
if (iteration <= 1) |
|
17657
|
0 |
0 |
if (!conf.buffer.empty()) { |
|
17659
|
0 |
0 |
for (size_t i = conf.buffer.size(); i--; ) { |
|
17662
|
0 |
0 |
for (auto&& child : gold.nodes[node].children) |
|
17664
|
0 |
0 |
if (to_right_stack) { |
|
17665
|
0 |
0 |
right_stack.push_back(node); |
|
17672
|
0 |
0 |
class t_representation { |
|
|
0 |
0 |
class t_representation { |
|
|
0 |
0 |
class t_representation { |
|
|
0 |
0 |
class t_representation { |
|
17675
|
0 |
0 |
: stack(stack), right_stack(right_stack), gold(gold), labels(labels) { |
|
|
0 |
0 |
: stack(stack), right_stack(right_stack), gold(gold), labels(labels) { |
|
17676
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
17677
|
0 |
0 |
costs[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
|
17678
|
0 |
0 |
transitions[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
|
17680
|
0 |
0 |
} |
|
|
0 |
0 |
} |
|
17690
|
0 |
0 |
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
|
|
0 |
0 |
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
|
|
0 |
0 |
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
|
17693
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17694
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17706
|
0 |
0 |
} t(conf.stack, right_stack, gold, labels); |
|
17708
|
0 |
0 |
t.prepare(0); |
|
17710
|
0 |
0 |
for (unsigned diagonal = 0; diagonal < conf.stack.size() + right_stack.size(); diagonal++) { |
|
17711
|
0 |
0 |
t.prepare(diagonal + 1); |
|
17712
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
17716
|
0 |
0 |
if (i+1 < conf.stack.size()) |
|
17717
|
0 |
0 |
for (unsigned h = 0; h <= diagonal; h++) { |
|
17719
|
0 |
0 |
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
|
|
0 |
0 |
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
|
|
0 |
0 |
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
|
17721
|
0 |
0 |
t.transition(i+1, j, h+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(h_node, new_node); |
|
17723
|
0 |
0 |
if (t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i+1, j, 0) + (t.transition(i, j, h) != 0)) { |
|
17725
|
0 |
0 |
t.transition(i+1, j, 0) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(new_node, h_node); |
|
17730
|
0 |
0 |
if (j+1 < right_stack.size() + 1) |
|
17731
|
0 |
0 |
for (unsigned h = 0; h <= diagonal; h++) { |
|
17733
|
0 |
0 |
if (t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i, j+1, h) + (t.transition(i, j, h) > 0)) { |
|
17735
|
0 |
0 |
t.transition(i, j+1, h) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
|
17737
|
0 |
0 |
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
|
|
0 |
0 |
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
|
|
0 |
0 |
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
|
17739
|
0 |
0 |
t.transition(i, j+1, diagonal+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
|
17750
|
0 |
0 |
if (name == "static") return new transition_system_projective_oracle_static(labels); |
|
|
0 |
0 |
if (name == "static") return new transition_system_projective_oracle_static(labels); |
|
17751
|
0 |
0 |
if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels); |
|
|
0 |
0 |
if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels); |
|
17773
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
|
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
|
17774
|
0 |
0 |
transitions.emplace_back(new transition_swap()); |
|
|
0 |
0 |
transitions.emplace_back(new transition_swap()); |
|
17775
|
0 |
0 |
for (auto&& label : labels) { |
|
17776
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
17777
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
17785
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
17791
|
0 |
0 |
: labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {} |
|
|
0 |
0 |
: labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {} |
|
17818
|
0 |
0 |
if (lazy) { |
|
17819
|
0 |
0 |
tree_oracle_static projective_oracle(labels, root_label, gold, vector(), vector()); |
|
17822
|
0 |
0 |
transition_system_swap system(labels); |
|
17824
|
0 |
0 |
conf.init(&t); |
|
17825
|
0 |
0 |
while (!conf.final()) { |
|
17827
|
0 |
0 |
if (!system.applicable(conf, prediction.to_follow)) break; |
|
|
0 |
0 |
if (!system.applicable(conf, prediction.to_follow)) break; |
|
17828
|
0 |
0 |
system.perform(conf, prediction.to_follow); |
|
17832
|
0 |
0 |
for (auto&& node : conf.stack) |
|
17833
|
0 |
0 |
if (node) |
|
17837
|
0 |
0 |
return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components))); |
|
|
0 |
0 |
return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components))); |
|
17842
|
0 |
0 |
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
|
|
0 |
0 |
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
|
|
0 |
0 |
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
|
17845
|
0 |
0 |
while (child_index < gold.nodes[node].children.size()) |
|
17851
|
0 |
0 |
for (auto&& child : gold.nodes[node].children) |
|
17857
|
0 |
0 |
if (!conf.buffer.empty()) transitions.push_back(0); |
|
17858
|
0 |
0 |
if (conf.stack.size() >= 2) { |
|
17860
|
0 |
0 |
if (!projective_order.empty()) { |
|
17863
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
17864
|
0 |
0 |
(projective_components.empty() || |
|
17865
|
0 |
0 |
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
|
17870
|
0 |
0 |
for (int direction = 0; direction < 2; direction++) { |
|
17872
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17873
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17874
|
0 |
0 |
if (!conf.single_root || |
|
|
0 |
0 |
if (!conf.single_root || |
|
17875
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
17876
|
0 |
0 |
(i != root_label && conf.stack.size() > 2)) |
|
17884
|
0 |
0 |
if (conf.stack.size() >= 2) { |
|
17887
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
17888
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17889
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17897
|
0 |
0 |
if (conf.stack.size() >= 2) { |
|
17900
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
17901
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
|
17902
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
|
17910
|
0 |
0 |
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
|
|
0 |
0 |
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
|
|
0 |
0 |
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
|
17913
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
17914
|
0 |
0 |
(projective_components.empty() || |
|
17915
|
0 |
0 |
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
|
17925
|
0 |
0 |
if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false); |
|
|
0 |
0 |
if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false); |
|
17926
|
0 |
0 |
if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true); |
|
|
0 |
0 |
if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true); |
|
17950
|
1 |
0 |
clear(); |
|
17964
|
0 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
|
|
0 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
|
|
0 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
|
|
7 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
|
17969
|
38 |
0 |
assert(id >= 0 && id < int(nodes.size())); |
|
|
0 |
38 |
assert(id >= 0 && id < int(nodes.size())); |
|
17970
|
0 |
38 |
assert(head < int(nodes.size())); |
|
17973
|
0 |
38 |
if (nodes[id].head >= 0) { |
|
17975
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
17976
|
0 |
0 |
if (children[i-1] == id) { |
|
17985
|
38 |
0 |
if (head >= 0) { |
|
17988
|
27 |
29 |
while (i && children[i-1] > id) i--; |
|
|
9 |
18 |
while (i && children[i-1] > id) i--; |
|
|
18 |
38 |
while (i && children[i-1] > id) i--; |
|
17989
|
9 |
29 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
9 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
38 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
17994
|
0 |
0 |
for (auto&& node : nodes) { |
|
|
0 |
0 |
for (auto&& node : nodes) { |
|
|
0 |
0 |
for (auto&& node : nodes) { |
|
|
0 |
0 |
for (auto&& node : nodes) { |
|
|
8 |
1 |
for (auto&& node : nodes) { |
|
18088
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
18118
|
0 |
0 |
if (name == "conllu") return new_conllu_input_format(); |
|
18128
|
0 |
0 |
if (name == "conllu") return new_conllu_output_format(); |
|
18156
|
0 |
0 |
if (make_copy) { |
|
18171
|
0 |
0 |
while (text.len) { |
|
18174
|
0 |
0 |
while (line.len < text.len && line.str[line.len] != '\n') line.len++; |
|
|
0 |
0 |
while (line.len < text.len && line.str[line.len] != '\n') line.len++; |
|
18179
|
0 |
0 |
if (!line.len) { |
|
18180
|
0 |
0 |
if (t.empty()) continue; |
|
18184
|
0 |
0 |
if (*line.str == '#') { |
|
18186
|
0 |
0 |
if (t.empty()) comments.push_back(line); |
|
|
0 |
0 |
if (t.empty()) comments.push_back(line); |
|
18191
|
0 |
0 |
split(line, '\t', tokens); |
|
18192
|
0 |
0 |
if (tokens.size() != 10) |
|
18193
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
18196
|
0 |
0 |
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
|
18197
|
0 |
0 |
split(tokens[0], '-', parts); |
|
18198
|
0 |
0 |
if (parts.size() != 2) |
|
18199
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
18201
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
18203
|
0 |
0 |
if (from != int(t.nodes.size())) |
|
18204
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
18205
|
0 |
0 |
if (to < from) |
|
18206
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
18207
|
0 |
0 |
if (from <= last_multiword_token) |
|
18208
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
18210
|
0 |
0 |
multiword_tokens.emplace_back(from, line); |
|
18216
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
18218
|
0 |
0 |
if (id != int(t.nodes.size())) |
|
18219
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
18222
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
18225
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
18227
|
0 |
0 |
if (head < 0) |
|
18228
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
18233
|
0 |
0 |
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
|
|
0 |
0 |
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
|
|
0 |
0 |
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
|
18234
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
|
18235
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
|
18236
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
|
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
|
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
|
18238
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
|
18239
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
|
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
|
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
|
18240
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
|
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
|
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
|
18244
|
0 |
0 |
if (last_multiword_token >= int(t.nodes.size())) |
|
18245
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
|
18248
|
0 |
0 |
for (auto&& node : t.nodes) |
|
18249
|
0 |
0 |
if (node.id && node.head >= 0) { |
|
|
0 |
0 |
if (node.id && node.head >= 0) { |
|
18250
|
0 |
0 |
if (node.head >= int(t.nodes.size())) |
|
18251
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
18252
|
0 |
0 |
t.set_head(node.id, node.head, node.deprel); |
|
18266
|
0 |
0 |
auto input_conllu = dynamic_cast(additional_info); |
|
18270
|
0 |
0 |
if (input_conllu) |
|
18271
|
0 |
0 |
for (auto&& comment : input_conllu->comments) |
|
18275
|
0 |
0 |
for (int i = 1 /*skip the root node*/; i < int(t.nodes.size()); i++) { |
|
18277
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
18285
|
0 |
0 |
output.append(to_string(i)).push_back('\t'); |
|
18291
|
0 |
0 |
output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t'); |
|
|
0 |
0 |
output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t'); |
|
18348
|
0 |
0 |
return {1, 1, 1, "devel"}; |
|
|
0 |
0 |
return {1, 1, 1, "devel"}; |
|
18359
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
18361
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
18363
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
|
18407
|
20 |
2 |
const string input_format_conllu::columns[10] = {"ID", "FORM", "LEMMA", |
|
18408
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
|
0 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
18419
|
0 |
0 |
if (make_copy) { |
|
18432
|
0 |
0 |
while (text.len) { |
|
18435
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++; |
|
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++; |
|
18438
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
18440
|
0 |
0 |
else if (text.len && *text.str == '\n') |
|
|
0 |
0 |
else if (text.len && *text.str == '\n') |
|
18444
|
0 |
0 |
if (!line.len) { |
|
18445
|
0 |
0 |
if (s.empty()) continue; |
|
18449
|
0 |
0 |
if (*line.str == '#') { |
|
18451
|
0 |
0 |
if (s.empty()) s.comments.emplace_back(line.str, line.len); |
|
|
0 |
0 |
if (s.empty()) s.comments.emplace_back(line.str, line.len); |
|
18456
|
0 |
0 |
split(line, '\t', tokens); |
|
18457
|
0 |
0 |
if (tokens.size() != 10) |
|
18458
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
18461
|
0 |
0 |
for (int i = 0; i < 10; i++) { |
|
18462
|
0 |
0 |
if (!tokens[i].len) |
|
18463
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
18464
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
18465
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
18469
|
0 |
0 |
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
|
18470
|
0 |
0 |
split(tokens[0], '-', parts); |
|
18471
|
0 |
0 |
if (parts.size() != 2) |
|
18472
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
18474
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
18476
|
0 |
0 |
if (from != int(s.words.size())) |
|
18477
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
18478
|
0 |
0 |
if (to < from) |
|
18479
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
18480
|
0 |
0 |
if (from <= last_multiword_token) |
|
18481
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
18483
|
0 |
0 |
for (int i = 2; i < 9; i++) |
|
18484
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
18485
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
18486
|
0 |
0 |
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
|
|
0 |
0 |
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
|
|
0 |
0 |
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
|
18491
|
0 |
0 |
if (version >= 2) |
|
18492
|
0 |
0 |
if (memchr(tokens[0].str, '.', tokens[0].len)) { |
|
18493
|
0 |
0 |
split(tokens[0], '.', parts); |
|
18494
|
0 |
0 |
if (parts.size() != 2) |
|
18495
|
0 |
0 |
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
|
|
0 |
0 |
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
|
|
0 |
0 |
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
|
18497
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
18499
|
0 |
0 |
if (id != int(s.words.size()) - 1) |
|
18500
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
18501
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
18502
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
18503
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
18504
|
0 |
0 |
for (int i = 6; i < 8; i++) |
|
18505
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
18506
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
18508
|
0 |
0 |
s.empty_nodes.emplace_back(id, index); |
|
18511
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
|
18512
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
|
18513
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
|
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
|
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
|
18514
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
|
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
|
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
|
18515
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
|
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
|
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
|
18521
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
18523
|
0 |
0 |
if (id != int(s.words.size())) |
|
18524
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
18527
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
18530
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
18532
|
0 |
0 |
if (head < 0) |
|
18533
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
18539
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
|
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
|
18540
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
|
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
|
18541
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
|
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
|
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
|
18543
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
|
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
|
18544
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
|
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
|
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
|
18545
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
|
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
|
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
|
18549
|
0 |
0 |
if (last_multiword_token >= int(s.words.size())) |
|
18550
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false; |
|
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false; |
|
18553
|
0 |
0 |
for (auto&& word : s.words) |
|
18554
|
0 |
0 |
if (word.id && word.head >= 0) { |
|
|
0 |
0 |
if (word.id && word.head >= 0) { |
|
18555
|
0 |
0 |
if (word.head >= int(s.words.size())) |
|
18556
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
18557
|
0 |
0 |
s.set_head(word.id, word.head, word.deprel); |
|
18581
|
0 |
0 |
if (getline(is, block)) |
|
18595
|
0 |
0 |
if (make_copy) { |
|
18607
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
18613
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
18617
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
18623
|
0 |
0 |
if (s.words.back().form.find("\302\240") != string::npos) { |
|
18626
|
0 |
0 |
for (size_t i = 0; i < form.size(); i++) { |
|
18627
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
18636
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t')) |
|
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t')) |
|
18640
|
0 |
0 |
if (!s.empty()) { |
|
18642
|
0 |
0 |
if (new_document) |
|
18647
|
0 |
0 |
if (preceeding_newlines >= 2) |
|
18652
|
0 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
|
18688
|
0 |
0 |
if (make_copy) { |
|
18700
|
0 |
0 |
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
0 |
0 |
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
|
0 |
0 |
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
18706
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
18710
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
18716
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') |
|
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') |
|
18720
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
18722
|
0 |
0 |
else if (text.len && *text.str == '\n') |
|
|
0 |
0 |
else if (text.len && *text.str == '\n') |
|
18726
|
0 |
0 |
while (text.len && *text.str == '\t') |
|
|
0 |
0 |
while (text.len && *text.str == '\t') |
|
18730
|
0 |
0 |
if (!s.empty()) { |
|
18732
|
0 |
0 |
if (new_document) |
|
18737
|
0 |
0 |
if (preceeding_newlines >= 2) |
|
18742
|
0 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
|
18769
|
0 |
0 |
if (getline(is, block)) |
|
18784
|
0 |
0 |
if (make_copy) { |
|
18797
|
0 |
0 |
while (text.len && s.empty()) { |
|
|
0 |
0 |
while (text.len && s.empty()) { |
|
|
0 |
0 |
while (text.len && s.empty()) { |
|
18800
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
18804
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r')) |
|
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r')) |
|
18806
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
18813
|
0 |
0 |
tokenizer->set_text(line, false); |
|
18814
|
0 |
0 |
while (tokenizer->next_sentence(partial, error)) { |
|
|
0 |
0 |
while (tokenizer->next_sentence(partial, error)) { |
|
18817
|
0 |
0 |
for (size_t i = 1; i < partial.words.size(); i++) { |
|
18820
|
0 |
0 |
if (s.words.back().head > 0) s.words.back().head += words; |
|
18824
|
0 |
0 |
for (auto&& multiword_token : partial.multiword_tokens) { |
|
18831
|
0 |
0 |
for (auto&& empty_node : partial.empty_nodes) { |
|
18836
|
0 |
0 |
if (!error.empty()) return false; |
|
18838
|
0 |
0 |
if (s.empty()) { |
|
18844
|
0 |
0 |
if (!s.empty()) { |
|
18846
|
0 |
0 |
if (new_document) |
|
18847
|
0 |
0 |
s.set_new_doc(true, document_id); |
|
18851
|
0 |
0 |
if (preceeding_newlines >= 2) |
|
18852
|
0 |
0 |
s.set_new_par(true); |
|
18856
|
0 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
|
18859
|
0 |
0 |
s.comments.emplace_back("# text = "); |
|
18860
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
18861
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
18862
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
18866
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
18877
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
18881
|
0 |
0 |
if (parsed_options.count(CONLLU_V1)) |
|
18883
|
0 |
0 |
if (parsed_options.count(CONLLU_V2)) |
|
18886
|
0 |
0 |
return new input_format_conllu(version); |
|
18892
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
18898
|
0 |
0 |
input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges); |
|
|
0 |
0 |
input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges); |
|
|
0 |
0 |
input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges); |
|
18899
|
0 |
0 |
return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result; |
|
|
0 |
0 |
return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result; |
|
|
0 |
0 |
return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result; |
|
18912
|
0 |
0 |
size_t name_len = equal != string::npos ? equal : name.size(); |
|
18913
|
0 |
0 |
size_t option_offset = equal != string::npos ? equal + 1 : name.size(); |
|
18915
|
0 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset)); |
|
18916
|
0 |
0 |
if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset)); |
|
18917
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset)); |
|
18918
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset)); |
|
18966
|
0 |
0 |
while (str.len) { |
|
18967
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
18970
|
0 |
0 |
if (str.len) { |
|
18971
|
0 |
0 |
if (to_print < str.str) os.write(to_print, str.str - to_print); |
|
18972
|
0 |
0 |
os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """); |
|
|
0 |
0 |
os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """); |
|
|
0 |
0 |
os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """); |
|
18978
|
0 |
0 |
if (to_print < str.str) os.write(to_print, str.str - to_print); |
|
19014
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
4 |
3 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
2 |
5 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
7 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
19022
|
1 |
4 |
for (auto&& comment : s.comments) |
|
19027
|
8 |
1 |
for (int i = 0; i < int(s.words.size()); i++) { |
|
19029
|
7 |
1 |
if (i > 0) { |
|
19031
|
0 |
7 |
if (multiword_token < s.multiword_tokens.size() && |
|
|
0 |
0 |
if (multiword_token < s.multiword_tokens.size() && |
|
|
0 |
7 |
if (multiword_token < s.multiword_tokens.size() && |
|
19047
|
0 |
7 |
if (s.words[i].head < 0) os << '_'; else os << s.words[i].head; os << '\t' |
|
19054
|
8 |
0 |
if (version >= 2) |
|
19055
|
0 |
8 |
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
|
|
0 |
0 |
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
|
|
0 |
8 |
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
|
19072
|
0 |
14 |
if (version >= 2 || str.find(' ') == string::npos) |
|
|
0 |
0 |
if (version >= 2 || str.find(' ') == string::npos) |
|
|
14 |
0 |
if (version >= 2 || str.find(' ') == string::npos) |
|
19075
|
0 |
0 |
for (auto&& chr : str) |
|
19076
|
0 |
0 |
os << (chr == ' ' ? '_' : chr); |
|
19092
|
0 |
0 |
json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; } |
|
|
0 |
0 |
json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; } |
|
19094
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
19095
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
19103
|
0 |
0 |
if (comma_needed) { |
|
19111
|
0 |
0 |
for (; str.len; str.str++, str.len--) |
|
19121
|
0 |
0 |
if (((unsigned char)*str.str) < 32) { |
|
19131
|
0 |
0 |
for (; value || start_size == json.size(); value /= 10) |
|
|
0 |
0 |
for (; value || start_size == json.size(); value /= 10) |
|
|
0 |
0 |
for (; value || start_size == json.size(); value /= 10) |
|
19146
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
|
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
|
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
|
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
|
19148
|
0 |
0 |
for (size_t i = 1; i < s.words.size(); i++) { |
|
19149
|
0 |
0 |
json.object().key("id").value(i).key("form").value(s.words[i].form); |
|
|
0 |
0 |
json.object().key("id").value(i).key("form").value(s.words[i].form); |
|
|
0 |
0 |
json.object().key("id").value(i).key("form").value(s.words[i].form); |
|
19152
|
0 |
0 |
if (s.words[i].get_token_range(start, end)) |
|
19153
|
0 |
0 |
json.key("start").value(start).key("end").value(end); |
|
|
0 |
0 |
json.key("start").value(start).key("end").value(end); |
|
19154
|
0 |
0 |
if (s.words[i].head == 0) |
|
19157
|
0 |
0 |
json.key("properties").object() |
|
|
0 |
0 |
json.key("properties").object() |
|
19158
|
0 |
0 |
.key("lemma").value(s.words[i].lemma) |
|
19159
|
0 |
0 |
.key("upos").value(s.words[i].upostag) |
|
19160
|
0 |
0 |
.key("xpos").value(s.words[i].xpostag); |
|
19162
|
0 |
0 |
for (auto&& feat : feats) { |
|
19164
|
0 |
0 |
while (key.len < feat.len && key.str[key.len] != '=') |
|
|
0 |
0 |
while (key.len < feat.len && key.str[key.len] != '=') |
|
19166
|
0 |
0 |
if (key.len + 1 < feat.len) |
|
19167
|
0 |
0 |
json.key(key).value(string_piece(key.str + key.len + 1, feat.len - key.len - 1)); |
|
19171
|
0 |
0 |
if (!s.words[i].children.empty()) { |
|
19173
|
0 |
0 |
for (auto&& child : s.words[i].children) |
|
19174
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
19204
|
0 |
0 |
if (!sentences) { |
|
19210
|
0 |
0 |
for (auto&& node : s.words[0].children) |
|
19211
|
0 |
0 |
write_node(s, node, pad, os); |
|
19226
|
0 |
0 |
os << pad << "
|
|
|
0 |
0 |
os << pad << "
|
|
19227
|
0 |
0 |
<< "\" form=\"" << xml_encoded(s.words[node].form, true) |
|
19228
|
0 |
0 |
<< "\" lem=\"" << xml_encoded(s.words[node].lemma, true) |
|
19229
|
0 |
0 |
<< "\" mi=\"" << xml_encoded(s.words[node].feats, true) |
|
19230
|
0 |
0 |
<< "\" si=\"" << xml_encoded(s.words[node].deprel, true) << '"'; |
|
19232
|
0 |
0 |
if (s.words[node].children.empty()) { |
|
19236
|
0 |
0 |
for (auto&& child : s.words[node].children) |
|
19258
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
19263
|
0 |
0 |
for (size_t i = 1; i < s.words.size(); i++) { |
|
19265
|
0 |
0 |
for (auto&& chr : s.words[i].form) |
|
19266
|
0 |
0 |
if (chr == ' ') |
|
19267
|
0 |
0 |
line.append("\302\240"); |
|
19269
|
0 |
0 |
line.push_back(chr); |
|
19271
|
0 |
0 |
if (i+1 < s.words.size()) |
|
19272
|
0 |
0 |
line.push_back(' '); |
|
19290
|
0 |
0 |
if (normalized) { |
|
19291
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
19293
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
19294
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
19296
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) |
|
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) |
|
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) |
|
19298
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
19304
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
19305
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
19306
|
0 |
0 |
tok.get_spaces_before(spaces); os << spaces; |
|
19307
|
0 |
0 |
tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form); |
|
|
0 |
0 |
tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form); |
|
19308
|
0 |
0 |
tok.get_spaces_after(spaces); os << spaces; |
|
19309
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
19331
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
19335
|
0 |
0 |
for (size_t i = 1; i < s.words.size(); i++) |
|
19344
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
19348
|
0 |
1 |
if (parsed_options.count(CONLLU_V1)) |
|
19350
|
0 |
1 |
if (parsed_options.count(CONLLU_V2)) |
|
19353
|
1 |
0 |
return new output_format_conllu(version); |
|
19361
|
0 |
0 |
return new output_format_matxin(); |
|
19367
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
19370
|
0 |
0 |
return new output_format_horizontal(parsed_options.count(HORIZONTAL_PARAGRAPHS)); |
|
19376
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
19379
|
0 |
0 |
return new output_format_plaintext(parsed_options.count(PLAINTEXT_NORMALIZED_SPACES)); |
|
19385
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
19388
|
0 |
0 |
return new output_format_vertical(parsed_options.count(VERTICAL_PARAGRAPHS)); |
|
19393
|
1 |
0 |
size_t name_len = equal != string::npos ? equal : name.size(); |
|
19394
|
0 |
1 |
size_t option_offset = equal != string::npos ? equal + 1 : name.size(); |
|
19396
|
1 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset)); |
|
|
1 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset)); |
|
19397
|
0 |
0 |
if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset)); |
|
19398
|
0 |
0 |
if (name.compare(0, name_len, "matxin") == 0) return new_matxin_output_format(name.substr(option_offset)); |
|
19399
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset)); |
|
19400
|
0 |
0 |
if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset)); |
|
19401
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset)); |
|
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset)); |
|
19421
|
1 |
0 |
clear(); |
|
19439
|
0 |
0 |
words.emplace_back((int)words.size(), form); |
|
|
0 |
0 |
words.emplace_back((int)words.size(), form); |
|
|
0 |
0 |
words.emplace_back((int)words.size(), form); |
|
19444
|
7 |
0 |
assert(id >= 0 && id < int(words.size())); |
|
|
0 |
7 |
assert(id >= 0 && id < int(words.size())); |
|
19445
|
0 |
7 |
assert(head < int(words.size())); |
|
19448
|
0 |
7 |
if (words[id].head >= 0) { |
|
19450
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
19451
|
0 |
0 |
if (children[i-1] == id) { |
|
19460
|
7 |
0 |
if (head >= 0) { |
|
19463
|
4 |
3 |
while (i && children[i-1] > id) i--; |
|
|
4 |
0 |
while (i && children[i-1] > id) i--; |
|
|
0 |
7 |
while (i && children[i-1] > id) i--; |
|
19464
|
4 |
3 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
4 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
|
7 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
19469
|
0 |
0 |
for (auto&& word : words) { |
|
19477
|
0 |
0 |
if (get_comment("newdoc id", id)) |
|
19486
|
1 |
0 |
if (new_doc && id.len) |
|
|
0 |
1 |
if (new_doc && id.len) |
|
19488
|
1 |
0 |
else if (new_doc) |
|
19493
|
0 |
0 |
if (get_comment("newpar id", id)) |
|
19502
|
1 |
0 |
if (new_par && id.len) |
|
|
0 |
1 |
if (new_par && id.len) |
|
19504
|
1 |
0 |
else if (new_par) |
|
19517
|
1 |
0 |
if (id.len) |
|
19530
|
0 |
0 |
if (text.len) |
|
19535
|
0 |
0 |
for (auto&& comment : comments) |
|
19536
|
0 |
0 |
if (comment[0] == '#') { |
|
19539
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
19542
|
0 |
0 |
if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) { |
|
|
0 |
0 |
if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) { |
|
|
0 |
0 |
if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) { |
|
19544
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
19545
|
0 |
0 |
if (j < comment.size() && comment[j] == '=') { |
|
|
0 |
0 |
if (j < comment.size() && comment[j] == '=') { |
|
|
0 |
0 |
if (j < comment.size() && comment[j] == '=') { |
|
19548
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
19549
|
0 |
0 |
if (value) value->assign(comment, j, comment.size() - j); |
|
19552
|
0 |
0 |
if (value) value->clear(); |
|
19563
|
7 |
8 |
for (unsigned i = comments.size(); i--; ) |
|
19564
|
0 |
7 |
if (comments[i][0] == '#') { |
|
19567
|
14 |
0 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
|
7 |
7 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
|
0 |
7 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
|
7 |
7 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
19570
|
2 |
5 |
if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0) |
|
|
2 |
0 |
if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0) |
|
|
7 |
0 |
if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0) |
|
19579
|
3 |
0 |
comment.append("# ").append(name.str, name.len); |
|
|
3 |
0 |
comment.append("# ").append(name.str, name.len); |
|
19580
|
1 |
2 |
if (value.len) { |
|
19581
|
1 |
0 |
comment.append(" = "); |
|
19582
|
1 |
1 |
for (size_t i = 0; i < value.len; i++) |
|
19583
|
1 |
0 |
comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]); |
|
|
1 |
0 |
comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]); |
|
19602
|
3 |
8 |
if (form.len) this->form.assign(form.str, form.len); |
|
19603
|
0 |
11 |
if (misc.len) this->misc.assign(misc.str, misc.len); |
|
19610
|
2 |
4 |
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
|
|
2 |
0 |
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
|
|
2 |
0 |
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
|
19614
|
5 |
2 |
if (space_after) |
|
19624
|
0 |
0 |
if (get_misc_field("SpacesBefore", value)) |
|
19631
|
7 |
0 |
if (spaces_before.len == 0) |
|
19640
|
0 |
0 |
if (get_misc_field("SpacesAfter", value)) |
|
19643
|
0 |
0 |
spaces_after.assign(get_space_after() ? " " : ""); |
|
19647
|
2 |
5 |
if (spaces_after.len == 0) { |
|
19650
|
5 |
0 |
} else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') { |
|
|
5 |
0 |
} else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') { |
|
19662
|
0 |
0 |
if (get_misc_field("SpacesInToken", value)) |
|
19669
|
7 |
0 |
if (spaces_in_token.len == 0) |
|
19679
|
0 |
0 |
if (!get_misc_field("TokenRange", value)) return false; |
|
19682
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
19683
|
0 |
0 |
if (start > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
|
19689
|
0 |
0 |
if (value.len == 0 || value.str[0] != ':') return false; |
|
|
0 |
0 |
if (value.len == 0 || value.str[0] != ':') return false; |
|
19693
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
19694
|
0 |
0 |
if (end > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
|
19704
|
0 |
0 |
if (start == size_t(string::npos)) |
|
19707
|
0 |
0 |
start_misc_field("TokenRange").append(to_string(start)).append(1, ':').append(to_string(end)); |
|
19712
|
2 |
4 |
for (size_t index = 0; index < misc.size(); ) { |
|
19713
|
2 |
0 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
|
0 |
2 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
|
2 |
0 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
19717
|
2 |
0 |
value.len = (value.len == size_t(string::npos) ? misc.size() : value.len) - index; |
|
19721
|
0 |
0 |
if (index != size_t(string::npos)) index++; |
|
19727
|
8 |
28 |
for (size_t index = 0; index < misc.size(); ) |
|
19728
|
2 |
6 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
|
0 |
2 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
|
2 |
6 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
19730
|
2 |
0 |
if (end_index == size_t(string::npos)) end_index = misc.size(); |
|
19733
|
0 |
2 |
if (index) |
|
19736
|
2 |
0 |
misc.erase(index, end_index + (end_index < misc.size() ? 1 : 0) - index); |
|
19739
|
6 |
0 |
if (index != size_t(string::npos)) index++; |
|
19745
|
0 |
2 |
if (!misc.empty()) misc.push_back('|'); |
|
19751
|
0 |
0 |
for (unsigned i = 0; i < spaces.len; i++) |
|
19773
|
0 |
0 |
for (unsigned i = 0; i < escaped_spaces.len; i++) |
|
19774
|
0 |
0 |
if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len) |
|
|
0 |
0 |
if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len) |
|
19876
|
0 |
0 |
sa_lowercased(data_lowercased), sa_categorized(data_categorized) {} |
|
|
0 |
0 |
sa_lowercased(data_lowercased), sa_categorized(data_categorized) {} |
|
|
0 |
0 |
sa_lowercased(data_lowercased), sa_categorized(data_categorized) {} |
|
19880
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
19881
|
0 |
0 |
token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i]; |
|
|
0 |
0 |
token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i]; |
|
19883
|
0 |
0 |
if (previous_tok) { |
|
19886
|
0 |
0 |
if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0; |
|
|
0 |
0 |
if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0; |
|
|
0 |
0 |
if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0; |
|
19887
|
0 |
0 |
if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0; |
|
|
0 |
0 |
if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0; |
|
|
0 |
0 |
if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0; |
|
19888
|
0 |
0 |
if (!score) score = difference(previous_tok->form, tok->form, false, LOWERCASE); |
|
19889
|
0 |
0 |
if (!score) score = difference(previous_tok->form, tok->form, false, CATEGORIZE); |
|
19890
|
0 |
0 |
if (!score) score = difference(previous_tok->form, tok->form, true, CATEGORIZE); |
|
19892
|
0 |
0 |
if (score > 0) |
|
19900
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
19906
|
0 |
0 |
auto& func = mode == LOWERCASE ? perform_lowercase : perform_categorize; |
|
19907
|
0 |
0 |
auto& sa = mode == LOWERCASE ? sa_lowercased : sa_categorized; |
|
19910
|
0 |
0 |
string right_mapped = func(right); |
|
19913
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
19914
|
0 |
0 |
int together = sa.count(pattern); |
|
19916
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
19917
|
0 |
0 |
int apart = sa.count(pattern); |
|
19926
|
0 |
0 |
for (auto&& chr : utf8::decoder(input)) |
|
19927
|
0 |
0 |
utf8::append(output, unicode::lowercase(chr)); |
|
19935
|
0 |
0 |
for (auto&& chr : utf8::decoder(input)) { |
|
19937
|
0 |
0 |
if (category & unicode::C) output.push_back('C'); |
|
|
0 |
0 |
if (category & unicode::C) output.push_back('C'); |
|
19938
|
0 |
0 |
if (category & unicode::L) output.push_back('L'); |
|
|
0 |
0 |
if (category & unicode::L) output.push_back('L'); |
|
19939
|
0 |
0 |
if (category & unicode::M) output.push_back('M'); |
|
|
0 |
0 |
if (category & unicode::M) output.push_back('M'); |
|
19940
|
0 |
0 |
if (category & unicode::N) output.push_back('N'); |
|
|
0 |
0 |
if (category & unicode::N) output.push_back('N'); |
|
19941
|
0 |
0 |
if (category & unicode::Pc) output.push_back('c'); |
|
|
0 |
0 |
if (category & unicode::Pc) output.push_back('c'); |
|
19942
|
0 |
0 |
if (category & unicode::Pd) output.push_back('d'); |
|
|
0 |
0 |
if (category & unicode::Pd) output.push_back('d'); |
|
19943
|
0 |
0 |
if (category & unicode::Pe) output.push_back('e'); |
|
|
0 |
0 |
if (category & unicode::Pe) output.push_back('e'); |
|
19944
|
0 |
0 |
if (category & unicode::Pf) output.push_back('f'); |
|
|
0 |
0 |
if (category & unicode::Pf) output.push_back('f'); |
|
19945
|
0 |
0 |
if (category & unicode::Pi) output.push_back('i'); |
|
|
0 |
0 |
if (category & unicode::Pi) output.push_back('i'); |
|
19946
|
0 |
0 |
if (category & unicode::Po) output.push_back('o'); |
|
|
0 |
0 |
if (category & unicode::Po) output.push_back('o'); |
|
19947
|
0 |
0 |
if (category & unicode::Ps) output.push_back('s'); |
|
|
0 |
0 |
if (category & unicode::Ps) output.push_back('s'); |
|
19948
|
0 |
0 |
if (category & unicode::S) output.push_back('S'); |
|
|
0 |
0 |
if (category & unicode::S) output.push_back('S'); |
|
19949
|
0 |
0 |
if (category & unicode::Zl) output.push_back('Z'); |
|
|
0 |
0 |
if (category & unicode::Zl) output.push_back('Z'); |
|
19950
|
0 |
0 |
if (category & unicode::Zp) output.push_back('z'); |
|
|
0 |
0 |
if (category & unicode::Zp) output.push_back('z'); |
|
19951
|
0 |
0 |
if (category & unicode::Zs) output.push_back(' '); |
|
|
0 |
0 |
if (category & unicode::Zs) output.push_back(' '); |
|
19959
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) |
|
19960
|
0 |
0 |
if (unicode::category(chr) & unicode::L) |
|
19968
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) |
|
19969
|
0 |
0 |
if (unicode::category(chr) & ~unicode::N) |
|
19975
|
0 |
0 |
sa.reserve(str.size()); |
|
19976
|
0 |
0 |
for (unsigned i = 0; i < str.size(); i++) |
|
19977
|
0 |
0 |
sa.push_back(i); |
|
20003
|
1 |
0 |
: tokenizer(tokenizer), splitter(splitter), normalized_spaces(normalized_spaces), token_ranges(token_ranges) {} |
|
20023
|
0 |
1 |
for (char32_t chr; |
|
20024
|
1 |
0 |
text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
|
1 |
0 |
text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
20025
|
1 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); |
|
|
1 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); |
|
20032
|
34 |
1 |
for (following = text; following.len; unilib::utf8::decode(following.str, following.len)) |
|
20036
|
1 |
0 |
if (make_copy) { |
|
20053
|
0 |
2 |
if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) { |
|
|
1 |
1 |
if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) { |
|
20056
|
7 |
1 |
for (size_t i = 0; i < forms.size(); i++) { |
|
20057
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
20058
|
0 |
7 |
forms[i].str[0] == '\t' || forms[i].str[0] == ' ')) |
|
20060
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
|
0 |
7 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
20061
|
0 |
7 |
forms[i].str[forms[i].len-1] == '\t' || forms[i].str[forms[i].len-1] == ' ')) |
|
20063
|
0 |
7 |
if (!forms[i].len) |
|
20066
|
1 |
0 |
if (!forms.size()) return next_sentence(s, error); |
|
20068
|
7 |
1 |
for (size_t i = 0; i < forms.size(); i++) { |
|
20072
|
34 |
7 |
for (size_t j = 0; j < forms[i].len; j++) { |
|
20074
|
34 |
0 |
if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' '; |
|
|
0 |
34 |
if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' '; |
|
20075
|
0 |
34 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
|
0 |
0 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
|
0 |
0 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
|
34 |
0 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
20080
|
1 |
6 |
if (i == 0) { |
|
20081
|
0 |
1 |
if (forms[0].str > text.str) |
|
20085
|
7 |
0 |
if (!normalized_spaces) { |
|
20086
|
1 |
6 |
tok.set_spaces_before(i == 0 ? saved_spaces : ""); |
|
|
7 |
0 |
tok.set_spaces_before(i == 0 ? saved_spaces : ""); |
|
20091
|
1 |
6 |
if (i+1 == forms.size()) { |
|
20096
|
1 |
1 |
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
|
0 |
1 |
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
|
1 |
1 |
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
20097
|
0 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following) |
|
|
0 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following) |
|
20102
|
0 |
7 |
if (normalized_spaces) { |
|
20103
|
0 |
0 |
tok.set_space_after(i+1 == forms.size() ? !saved_spaces.empty() : forms[i+1].str > forms[i].str + forms[i].len); |
|
20105
|
0 |
7 |
tok.set_spaces_in_token(tok.form.size() != forms[i].len ? forms[i] : ""); |
|
20106
|
1 |
6 |
tok.set_spaces_after(i+1 == forms.size() ? saved_spaces : string_piece(forms[i].str + forms[i].len, forms[i+1].str - forms[i].str - forms[i].len)); |
|
20111
|
0 |
7 |
if (token_ranges) |
|
20114
|
7 |
0 |
if (splitter) |
|
20121
|
1 |
0 |
if (new_document) { |
|
20127
|
1 |
0 |
if (preceeding_newlines >= 2) |
|
20131
|
1 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
|
20135
|
7 |
1 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
20136
|
0 |
7 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
20137
|
0 |
7 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
7 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
20141
|
6 |
1 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
2 |
4 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
|
3 |
4 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
20148
|
0 |
1 |
if (text.len) { |
|
20185
|
7 |
0 |
if (it == full_rules.end()) { |
|
20186
|
0 |
7 |
if (version >= 2) { |
|
20189
|
0 |
0 |
while (suffix.size() + 1 < buffer.size()) { |
|
20193
|
0 |
0 |
if (suffix_it == suffix_rules.end()) |
|
20196
|
0 |
0 |
if (!suffix_it->second.words.empty()) { |
|
20204
|
7 |
0 |
if (!prefix_len) { |
|
20207
|
2 |
5 |
if (misc.len) s.words.back().misc.assign(misc.str, misc.len); |
|
20215
|
0 |
0 |
if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) { |
|
20217
|
0 |
0 |
for (auto&& chr : utf8::decoder(token.str, token.len)) |
|
20218
|
0 |
0 |
if (unicode::category(chr) & (unicode::L & ~unicode::Lut)) { casing = UC_FIRST; break; } |
|
20225
|
0 |
0 |
if (prefix_len) { |
|
20228
|
0 |
0 |
while (s.words.back().form.size() < prefix_len && suffix.len) |
|
|
0 |
0 |
while (s.words.back().form.size() < prefix_len && suffix.len) |
|
|
0 |
0 |
while (s.words.back().form.size() < prefix_len && suffix.len) |
|
20232
|
0 |
0 |
for (auto&& chr : utf8::decoder(it->second.words[0])) |
|
20233
|
0 |
0 |
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
|
|
0 |
0 |
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
|
|
0 |
0 |
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
|
20235
|
0 |
0 |
for (size_t i = 1; i < it->second.words.size(); i++) |
|
20236
|
0 |
0 |
if (casing != UC_ALL) { |
|
20246
|
1 |
0 |
if (!is.get(version)) return nullptr; |
|
20247
|
1 |
0 |
if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr; |
|
20250
|
1 |
0 |
if (!compressor::load(is, data)) return nullptr; |
|
|
1 |
0 |
if (!compressor::load(is, data)) return nullptr; |
|
20252
|
1 |
0 |
unique_ptr splitter(new multiword_splitter(version)); |
|
20254
|
1 |
0 |
for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) { |
|
|
0 |
1 |
for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) { |
|
20256
|
0 |
0 |
data.next_str(full_rule); |
|
20261
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
|
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
|
20262
|
0 |
0 |
info.words.emplace_back(); |
|
20263
|
0 |
0 |
data.next_str(info.words.back()); |
|
20265
|
0 |
0 |
if (info.words.empty()) return nullptr; |
|
20268
|
0 |
1 |
if (version >= 2) |
|
20269
|
0 |
0 |
for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) { |
|
|
0 |
0 |
for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) { |
|
20271
|
0 |
0 |
data.next_str(suffix_rule); |
|
20276
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
|
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
|
20277
|
0 |
0 |
info.words.emplace_back(); |
|
20278
|
0 |
0 |
data.next_str(info.words.back()); |
|
20280
|
0 |
0 |
if (info.words.empty()) return nullptr; |
|
20283
|
0 |
0 |
if (!suffix_rule.empty()) |
|
20284
|
0 |
0 |
for (suffix_rule.pop_back(); !suffix_rule.empty(); suffix_rule.pop_back()) |
|
20286
|
0 |
0 |
} |
|
20291
|
1 |
0 |
return data.is_end() ? splitter.release() : nullptr; |
|
20339
|
0 |
0 |
for (auto&& sentence : data) |
|
20340
|
0 |
0 |
for (auto&& multiword : sentence.multiword_tokens) { |
|
20343
|
0 |
0 |
for (int i = multiword.id_first; i <= multiword.id_last; i++) |
|
20344
|
0 |
0 |
utf8::map(unicode::lowercase, sentence.words[i].form, (lc_words.emplace_back(), lc_words.back())); |
|
20346
|
0 |
0 |
auto& info = full_rules[lc_form]; |
|
20347
|
0 |
0 |
if (info.words.empty()) |
|
20350
|
0 |
0 |
if (!info.count) full_rules.erase(lc_form); |
|
20354
|
0 |
0 |
for (auto&& sentence : data) |
|
20355
|
0 |
0 |
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
|
20356
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
20363
|
0 |
0 |
if (it != full_rules.end()) |
|
20364
|
0 |
0 |
if (!--it->second.count) |
|
20369
|
0 |
0 |
for (auto&& full_rule : full_rules) { |
|
20371
|
0 |
0 |
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
|
|
0 |
0 |
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
|
|
0 |
0 |
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
|
20372
|
0 |
0 |
for (; prefix_match; prefix_match--) |
|
20373
|
0 |
0 |
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
|
|
0 |
0 |
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
|
|
0 |
0 |
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
|
20374
|
0 |
0 |
lc_form.assign(full_rule.first, prefix_match, string::npos); |
|
20376
|
0 |
0 |
lc_words[0].erase(0, prefix_match); |
|
20378
|
0 |
0 |
auto& info = suffix_rules[lc_form]; |
|
20379
|
0 |
0 |
if (info.words.empty()) |
|
20382
|
0 |
0 |
if (!info.count) suffix_rules.erase(lc_form); |
|
20387
|
0 |
0 |
for (auto&& sentence : data) |
|
20388
|
0 |
0 |
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
|
20389
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
20395
|
0 |
0 |
while (lc_form.size() > 1) { |
|
20396
|
0 |
0 |
lc_form.erase(0, 1); |
|
20398
|
0 |
0 |
if (it != suffix_rules.end()) { |
|
20399
|
0 |
0 |
if (it->second.count <= 10) |
|
20408
|
0 |
0 |
binary_encoder enc; |
|
20410
|
0 |
0 |
for (auto&& full_rule : full_rules) { |
|
20411
|
0 |
0 |
enc.add_str(full_rule.first); |
|
20412
|
0 |
0 |
enc.add_1B(full_rule.second.words.size()); |
|
20413
|
0 |
0 |
for (auto& word : full_rule.second.words) |
|
20414
|
0 |
0 |
enc.add_str(word); |
|
20417
|
0 |
0 |
for (auto&& suffix_rule : suffix_rules) { |
|
20418
|
0 |
0 |
enc.add_str(suffix_rule.first); |
|
20419
|
0 |
0 |
enc.add_1B(suffix_rule.second.words.size()); |
|
20420
|
0 |
0 |
for (auto& word : suffix_rule.second.words) |
|
20421
|
0 |
0 |
enc.add_str(word); |
|
20425
|
0 |
0 |
os.put(multiword_splitter::VERSION_LATEST); |
|
20426
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
|
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
|
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
|
20536
|
0 |
0 |
stringstream os_buffer; |
|
20537
|
0 |
0 |
os_buffer.put(method.size()); |
|
20538
|
0 |
0 |
os_buffer.write(method.c_str(), method.size()); |
|
20541
|
0 |
0 |
if (method == "morphodita_parsito") { |
|
20542
|
0 |
0 |
if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error)) |
|
|
0 |
0 |
if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error)) |
|
20545
|
0 |
0 |
error.assign("Unknown UDPipe method '").append(method).append("'!"); |
|
|
0 |
0 |
error.assign("Unknown UDPipe method '").append(method).append("'!"); |
|
20547
|
0 |
0 |
} |
|
|
0 |
0 |
} |
|
20553
|
0 |
0 |
os << os_buffer.rdbuf(); |
|
20571
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
20595
|
0 |
0 |
enc.add_1B(maps.size()); |
|
20596
|
0 |
0 |
for (auto&& map : maps) |
|
20597
|
0 |
0 |
map.save(enc); |
|
20599
|
0 |
0 |
return compressor::save(os, enc); |
|
20622
|
0 |
0 |
for (auto&& description : ElementaryFeatures::descriptions) |
|
20623
|
0 |
0 |
if (!elementary_map.emplace(description.name, description).second) |
|
20624
|
0 |
0 |
training_failure("Repeated elementary feature with name " << description.name << '!'); |
|
20628
|
0 |
0 |
while (getline(is, line)) { |
|
|
0 |
0 |
while (getline(is, line)) { |
|
20629
|
0 |
0 |
split(line, ',', tokens); |
|
20630
|
0 |
0 |
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
|
|
0 |
0 |
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
|
|
0 |
0 |
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
|
20633
|
0 |
0 |
sequences.emplace_back(); |
|
20634
|
0 |
0 |
for (auto&& token : tokens) { |
|
20636
|
0 |
0 |
split(token, ' ', parts); |
|
20637
|
0 |
0 |
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
|
|
0 |
0 |
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
|
|
0 |
0 |
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
|
20639
|
0 |
0 |
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
|
|
0 |
0 |
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
|
|
0 |
0 |
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
|
20642
|
0 |
0 |
int sequence_index = parse_int(parts[1].c_str(), "sequence_index"); |
|
20643
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
20644
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
20645
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
20647
|
0 |
0 |
sequences.back().elements.emplace_back(it->second.type, it->second.index, sequence_index); |
|
20648
|
0 |
0 |
if (desc.type == DYNAMIC) sequences.back().dependant_range = max(sequences.back().dependant_range, window_size + 1); |
|
20649
|
0 |
0 |
if (desc.type == PER_TAG) sequences.back().dependant_range = max(sequences.back().dependant_range, 1 - sequence_index); |
|
20652
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
20656
|
0 |
0 |
scores.resize(sequences.size()); |
|
20661
|
0 |
0 |
if (!elementary.save(os)) return false; |
|
20664
|
0 |
0 |
enc.add_1B(sequences.size()); |
|
20665
|
0 |
0 |
for (auto&& sequence : sequences) { |
|
20667
|
0 |
0 |
enc.add_1B(sequence.elements.size()); |
|
20668
|
0 |
0 |
for (auto&& element : sequence.elements) { |
|
20675
|
0 |
0 |
enc.add_1B(scores.size()); |
|
20676
|
0 |
0 |
for (auto&& score : scores) |
|
20677
|
0 |
0 |
score.save(enc); |
|
20679
|
0 |
0 |
return compressor::save(os, enc); |
|
20700
|
0 |
0 |
class training_elementary_feature_map { |
|
|
0 |
0 |
class training_elementary_feature_map { |
|
20734
|
0 |
0 |
return it != map.end() ? it->second.alpha : 0; |
|
20777
|
0 |
0 |
for (unsigned i = 0; i < map_indices.size(); i++) { |
|
20778
|
0 |
0 |
for (auto&& element : features.sequences[i].elements) |
|
20779
|
0 |
0 |
for (auto&& description : decltype(features.elementary)::descriptions) |
|
20780
|
0 |
0 |
if (element.type == description.type && element.elementary_index == description.index) |
|
|
0 |
0 |
if (element.type == description.type && element.elementary_index == description.index) |
|
20781
|
0 |
0 |
map_indices[i].emplace_back(description.map_index); |
|
20783
|
0 |
0 |
assert(map_indices[i].size() == features.sequences[i].elements.size()); |
|
20787
|
0 |
0 |
vector> counts(elementary.maps.size()); |
|
20789
|
0 |
0 |
for (unsigned i = 0; i < features.sequences.size(); i++) |
|
20790
|
0 |
0 |
for (auto&& element : features.scores[i].map) |
|
20791
|
0 |
0 |
if (element.second.gamma) { |
|
20793
|
0 |
0 |
for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size())) |
|
|
0 |
0 |
for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size())) |
|
20794
|
0 |
0 |
elementary_ids.emplace_back(vli::decode(key)); |
|
20796
|
0 |
0 |
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
|
20797
|
0 |
0 |
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
|
20798
|
0 |
0 |
if (map_indices[i][j] < 0) continue; |
|
20799
|
0 |
0 |
if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1); |
|
|
0 |
0 |
if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1); |
|
20805
|
0 |
0 |
for (auto&& count : counts) { |
|
20806
|
0 |
0 |
if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1); |
|
|
0 |
0 |
if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1); |
|
20809
|
0 |
0 |
for (elementary_feature_value i = 0; i < count.size(); i++) count[i].ori = i; |
|
20814
|
0 |
0 |
vector> elementary_ids_map(counts.size()); |
|
20815
|
0 |
0 |
for (unsigned i = 0; i < counts.size(); i++) { |
|
20816
|
0 |
0 |
elementary_ids_map[i].resize(counts[i].size()); |
|
20817
|
0 |
0 |
for (elementary_feature_value j = 0; j < counts[i].size(); j++) |
|
20818
|
0 |
0 |
elementary_ids_map[i][counts[i][j].ori] = counts[i][j].count ? j : elementary_feature_unknown; |
|
20823
|
0 |
0 |
for (unsigned i = 0; i < elementary.maps.size(); i++) { |
|
20825
|
0 |
0 |
for (auto&& element : elementary.maps[i].map) |
|
20826
|
0 |
0 |
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
|
|
0 |
0 |
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
|
|
0 |
0 |
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
|
20829
|
0 |
0 |
optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) { |
|
|
0 |
0 |
optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) { |
|
20835
|
0 |
0 |
optimized_features.sequences = features.sequences; |
|
20838
|
0 |
0 |
for (unsigned i = 0; i < features.sequences.size(); i++) { |
|
20840
|
0 |
0 |
for (auto&& element : features.scores[i].map) |
|
20841
|
0 |
0 |
if (element.second.gamma) { |
|
20843
|
0 |
0 |
for (const char* key = element.first.c_str(); key < element.first.c_str() + element.first.size(); ) |
|
20844
|
0 |
0 |
elementary_ids.emplace_back(vli::decode(key)); |
|
20846
|
0 |
0 |
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
|
20847
|
0 |
0 |
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
|
20848
|
0 |
0 |
if (map_indices[i][j] < 0) continue; |
|
20849
|
0 |
0 |
assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown); |
|
|
0 |
0 |
assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown); |
|
20853
|
0 |
0 |
key_buffer.resize(elementary_ids.size() * vli::max_length()); |
|
20855
|
0 |
0 |
for (unsigned j = 0; j < elementary_ids.size(); j++) |
|
20861
|
0 |
0 |
optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
|
|
0 |
0 |
optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
|
20862
|
0 |
0 |
assert(feature_sequence_score(info.gamma) == info.gamma); |
|
20928
|
0 |
0 |
if (!d) training_failure("Cannot load dictionary!"); |
|
|
0 |
0 |
if (!d) training_failure("Cannot load dictionary!"); |
|
|
0 |
0 |
if (!d) training_failure("Cannot load dictionary!"); |
|
20930
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
20935
|
0 |
0 |
load_data(in_train, *d, use_guesser, train_data, true); |
|
20938
|
0 |
0 |
if (in_heldout) { |
|
20941
|
0 |
0 |
load_data(in_heldout, *d, use_guesser, heldout_data, false); |
|
20946
|
0 |
0 |
out_tagger << in_morpho_dict.rdbuf(); |
|
20947
|
0 |
0 |
out_tagger.put(use_guesser); |
|
20950
|
0 |
0 |
TaggerTrainer::train(decoding_order, window_size, iterations, train_data, heldout_data, early_stopping, prune_features, in_feature_templates, out_tagger); |
|
20961
|
0 |
0 |
sentences.emplace_back(); |
|
20962
|
0 |
0 |
while (getline(is, line)) { |
|
|
0 |
0 |
while (getline(is, line)) { |
|
20963
|
0 |
0 |
if (line.empty()) { |
|
20964
|
0 |
0 |
if (!sentences.back().words.empty()) |
|
20965
|
0 |
0 |
sentences.emplace_back(); |
|
20969
|
0 |
0 |
split(line, '\t', tokens); |
|
20970
|
0 |
0 |
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
|
|
0 |
0 |
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
|
20975
|
0 |
0 |
s.words.emplace_back(tokens[0]); |
|
20976
|
0 |
0 |
s.gold.emplace_back(tokens[1], tokens[2]); |
|
20977
|
0 |
0 |
s.gold_index.emplace_back(-1); |
|
20980
|
0 |
0 |
s.analyses.emplace_back(); |
|
20981
|
0 |
0 |
d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back()); |
|
|
0 |
0 |
d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back()); |
|
20984
|
0 |
0 |
for (size_t i = 0; i < s.analyses.back().size(); i++) |
|
20985
|
0 |
0 |
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
|
|
0 |
0 |
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
|
|
0 |
0 |
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
|
20990
|
0 |
0 |
if (s.gold_index.back() == -1 && add_gold) { |
|
|
0 |
0 |
if (s.gold_index.back() == -1 && add_gold) { |
|
|
0 |
0 |
if (s.gold_index.back() == -1 && add_gold) { |
|
20992
|
0 |
0 |
s.analyses.back().emplace_back(tokens[1], tokens[2]); |
|
20995
|
0 |
0 |
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
|
|
0 |
0 |
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
|
|
0 |
0 |
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
|
20998
|
0 |
0 |
for (auto&& sentence : sentences) |
|
20999
|
0 |
0 |
for (auto&& word : sentence.words) |
|
21000
|
0 |
0 |
sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word))); |
|
|
0 |
0 |
sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word))); |
|
21040
|
0 |
0 |
features.parse(window_size, in_feature_templates); |
|
21043
|
0 |
0 |
train_viterbi(decoding_order, window_size, iterations, train, heldout, early_stopping, prune_features, features); |
|
21048
|
0 |
0 |
optimizer::optimize(features, optimized_features); |
|
21049
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
21058
|
0 |
0 |
typename decltype(decoder)::cache decoder_cache(decoder); |
|
21060
|
0 |
0 |
typename FeatureSequences::cache feature_sequences_cache(features); |
|
21064
|
0 |
0 |
vector window(window_size); |
|
21067
|
0 |
0 |
if (prune_features) |
|
21068
|
0 |
0 |
for (unsigned s = 0; s < train.size(); s++) { |
|
21070
|
0 |
0 |
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
|
21071
|
0 |
0 |
for (int i = 0; i < int(sentence.forms.size()); i++) { |
|
21073
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
21076
|
0 |
0 |
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
|
21078
|
0 |
0 |
for (unsigned f = 0; f < features.scores.size(); f++) |
|
21079
|
0 |
0 |
if (!gold_feature_sequences_keys[f].empty()) |
|
21085
|
0 |
0 |
for (int i = 0; i < iterations; i++) { |
|
21088
|
0 |
0 |
cerr << "Iteration " << i + 1 << ": "; |
|
|
0 |
0 |
cerr << "Iteration " << i + 1 << ": "; |
|
21091
|
0 |
0 |
for (unsigned s = 0; s < train.size(); s++) { |
|
21095
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size()); |
|
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size()); |
|
21096
|
0 |
0 |
decoder.tag(sentence.forms, sentence.analyses, decoder_cache, tags); |
|
21099
|
0 |
0 |
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
|
21100
|
0 |
0 |
for (int i = 0; i < int(sentence.forms.size()); i++) { |
|
21105
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j]; |
|
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j]; |
|
21107
|
0 |
0 |
features.feature_keys(i, window.data(), 0, decoded_dynamic_features, decoded_feature_sequences_keys, feature_sequences_cache); |
|
21109
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
21111
|
0 |
0 |
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
|
21113
|
0 |
0 |
for (unsigned f = 0; f < features.scores.size(); f++) { |
|
21114
|
0 |
0 |
if (decoded_feature_sequences_keys[f] != gold_feature_sequences_keys[f]) { |
|
21115
|
0 |
0 |
if (!decoded_feature_sequences_keys[f].empty()) { |
|
21117
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
21118
|
0 |
0 |
if (it != features.scores[f].map.end()) { |
|
21126
|
0 |
0 |
if (!gold_feature_sequences_keys[f].empty()) { |
|
21128
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
21129
|
0 |
0 |
if (it != features.scores[f].map.end()) { |
|
21142
|
0 |
0 |
for (auto&& score : features.scores) |
|
21143
|
0 |
0 |
for (auto&& element : score.map) { |
|
21150
|
0 |
0 |
if (!heldout.empty()) { |
|
21156
|
0 |
0 |
optimizer::optimize(features, frozen_features); |
|
21158
|
0 |
0 |
typename decltype(frozen_decoder)::cache frozen_decoder_cache(frozen_decoder); |
|
21160
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
21161
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2); |
|
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2); |
|
21162
|
0 |
0 |
frozen_decoder.tag(sentence.forms, sentence.analyses, frozen_decoder_cache, tags); |
|
21164
|
0 |
0 |
for (unsigned i = 0; i < sentence.forms.size(); i++) { |
|
21167
|
0 |
0 |
heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
|
|
0 |
0 |
heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
|
21172
|
0 |
0 |
if (early_stopping && heldout_correct[BOTH] > best_correct) { |
|
|
0 |
0 |
if (early_stopping && heldout_correct[BOTH] > best_correct) { |
|
21175
|
0 |
0 |
best_features = features; |
|
21178
|
0 |
0 |
cerr << ", heldout accuracy " << fixed << setprecision(2) |
|
21186
|
0 |
0 |
if (early_stopping && best_iteration >= 0) { |
|
21187
|
0 |
0 |
cerr << "Chosen tagger model from iteration " << best_iteration + 1 << endl; |
|
21188
|
0 |
0 |
features = best_features; |
|
21286
|
0 |
0 |
for (auto&& sentence : training) |
|
21287
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
|
21288
|
0 |
0 |
if (!can_combine_tag(sentence.words[i], error)) |
|
21290
|
0 |
0 |
for (auto&& sentence : heldout) |
|
21291
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
|
21292
|
0 |
0 |
if (!can_combine_tag(sentence.words[i], error)) |
|
21295
|
0 |
0 |
if (!train_tokenizer(training, heldout, tokenizer, os, error)) return false; |
|
21298
|
0 |
0 |
ostringstream os_tagger; |
|
21299
|
0 |
0 |
if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false; |
|
|
0 |
0 |
if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false; |
|
21301
|
0 |
0 |
os.write(tagger_model.data(), tagger_model.size()); |
|
21303
|
0 |
0 |
if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false; |
|
|
0 |
0 |
if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false; |
|
21310
|
0 |
0 |
if (options == NONE) { |
|
21315
|
0 |
0 |
if (!named_values::parse(options, tokenizer, error)) return false; |
|
|
0 |
0 |
if (!named_values::parse(options, tokenizer, error)) return false; |
|
21316
|
0 |
0 |
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
|
|
0 |
0 |
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
|
|
0 |
0 |
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
|
21318
|
0 |
0 |
if (tokenizer.count("from_model")) { |
|
|
0 |
0 |
if (tokenizer.count("from_model")) { |
|
21321
|
0 |
0 |
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
|
|
0 |
0 |
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
|
|
0 |
0 |
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
|
21322
|
0 |
0 |
return error.assign("Cannot load model from which the tokenizer should be used!"), false; |
|
21325
|
0 |
0 |
os.write(tokenizer_data.str, tokenizer_data.len); |
|
21327
|
0 |
0 |
os.put(1); |
|
21328
|
0 |
0 |
const string& model = option_str(tokenizer, "model"); |
|
|
0 |
0 |
const string& model = option_str(tokenizer, "model"); |
|
21331
|
0 |
0 |
if (model == "generic") { |
|
21332
|
0 |
0 |
os.put(morphodita::tokenizer_id::GENERIC); |
|
21334
|
0 |
0 |
} else if (model.empty() || model == "gru") { |
|
|
0 |
0 |
} else if (model.empty() || model == "gru") { |
|
|
0 |
0 |
} else if (model.empty() || model == "gru") { |
|
21337
|
0 |
0 |
if (tokenizer.count("detokenize")) { |
|
|
0 |
0 |
if (tokenizer.count("detokenize")) { |
|
21338
|
0 |
0 |
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
|
|
0 |
0 |
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
|
|
0 |
0 |
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
|
21339
|
0 |
0 |
if (!detokenizer) return error.assign("Cannot create detokenizer!"), false; |
|
|
0 |
0 |
if (!detokenizer) return error.assign("Cannot create detokenizer!"), false; |
|
21345
|
0 |
0 |
for (size_t training_sentence = 0; training_sentence < training.size(); training_sentence++) { |
|
21346
|
0 |
0 |
sentence s = training[training_sentence]; |
|
21347
|
0 |
0 |
if (detokenizer) detokenizer->detokenize(s); |
|
|
0 |
0 |
if (detokenizer) detokenizer->detokenize(s); |
|
21349
|
0 |
0 |
auto& sentence = (sentences.emplace_back(), sentences.back()); |
|
21351
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
21352
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
|
21353
|
0 |
0 |
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
21355
|
0 |
0 |
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
|
21356
|
0 |
0 |
for (auto&& chr : unilib::utf8::decoder(tok.form)) { |
|
21357
|
0 |
0 |
sentence.sentence.push_back(chr); |
|
21358
|
0 |
0 |
if (unilib::unicode::category(chr) & unilib::unicode::Zs) spaces_in_training = true; |
|
21362
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
21364
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
21367
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
21374
|
0 |
0 |
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
|
|
0 |
0 |
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
|
|
0 |
0 |
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
|
21375
|
0 |
0 |
for (size_t heldout_sentence = 0; heldout_sentence < heldout.size(); heldout_sentence++) { |
|
21376
|
0 |
0 |
sentence s = heldout[heldout_sentence]; |
|
21377
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
21379
|
0 |
0 |
auto& sentence = (heldout_sentences.emplace_back(), heldout_sentences.back()); |
|
21381
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
|
21382
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
|
21383
|
0 |
0 |
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
21385
|
0 |
0 |
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
|
21386
|
0 |
0 |
for (auto&& chr : unilib::utf8::decoder(tok.form)) |
|
21387
|
0 |
0 |
sentence.sentence.push_back(chr); |
|
21390
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
21392
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
21395
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
21400
|
0 |
0 |
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
|
|
0 |
0 |
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
|
|
0 |
0 |
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
|
21401
|
0 |
0 |
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
|
|
0 |
0 |
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
|
|
0 |
0 |
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
|
21402
|
0 |
0 |
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
|
|
0 |
0 |
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
|
|
0 |
0 |
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
|
21403
|
0 |
0 |
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
|
|
0 |
0 |
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
|
|
0 |
0 |
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
|
21404
|
0 |
0 |
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
|
|
0 |
0 |
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
|
|
0 |
0 |
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
|
21405
|
0 |
0 |
int batch_size = run <= 1 ? 50 : 50 + 50 * hyperparameter_integer(run, 1, 0, 1); |
|
21406
|
0 |
0 |
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
|
|
0 |
0 |
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
|
|
0 |
0 |
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
|
21407
|
0 |
0 |
double learning_rate = run <= 1 ? 0.005 : hyperparameter_logarithmic(run, 2, 0.0005, 0.01); |
|
21408
|
0 |
0 |
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
|
|
0 |
0 |
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
|
|
0 |
0 |
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
|
21409
|
0 |
0 |
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
0 |
0 |
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
0 |
0 |
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
|
21410
|
0 |
0 |
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
|
|
0 |
0 |
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
|
|
0 |
0 |
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
|
21411
|
0 |
0 |
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
|
|
0 |
0 |
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
|
|
0 |
0 |
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
|
21412
|
0 |
0 |
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
|
|
0 |
0 |
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
|
|
0 |
0 |
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
|
21414
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
|
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
|
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
|
21417
|
0 |
0 |
cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0) |
|
|
0 |
0 |
cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0) |
|
21418
|
0 |
0 |
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
|
|
0 |
0 |
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
|
|
0 |
0 |
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
|
21419
|
0 |
0 |
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
|
|
0 |
0 |
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
|
|
0 |
0 |
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
|
21421
|
0 |
0 |
<< " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
|
0 |
0 |
<< " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
21424
|
0 |
0 |
os.put(morphodita::tokenizer_ids::GRU); |
|
21425
|
0 |
0 |
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
|
|
0 |
0 |
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
|
|
0 |
0 |
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
|
21431
|
0 |
0 |
return error.assign("Unknown tokenizer model '").append(model).append("'!"), false; |
|
|
0 |
0 |
return error.assign("Unknown tokenizer model '").append(model).append("'!"), false; |
|
21435
|
0 |
0 |
if (!multiword_splitter_trainer::train(training, os, error)) return false; |
|
|
0 |
0 |
if (!multiword_splitter_trainer::train(training, os, error)) return false; |
|
21444
|
0 |
0 |
if (options == NONE) { |
|
21449
|
0 |
0 |
if (!named_values::parse(options, tagger, error)) return false; |
|
|
0 |
0 |
if (!named_values::parse(options, tagger, error)) return false; |
|
21451
|
0 |
0 |
if (tagger.count("from_model")) { |
|
|
0 |
0 |
if (tagger.count("from_model")) { |
|
21454
|
0 |
0 |
string model_name = "from_model"; |
|
21456
|
0 |
0 |
do { |
|
21457
|
0 |
0 |
taggers_data.emplace_back(); |
|
21458
|
0 |
0 |
if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back())) |
|
|
0 |
0 |
if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back())) |
|
21459
|
0 |
0 |
return error.assign("Cannot load model from which the tagger should be used!"), false; |
|
21460
|
0 |
0 |
if (taggers_data.back().str[0]) { |
|
21463
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
21464
|
0 |
0 |
for (size_t i = 0; i < overrides.size(); i++) { |
|
21465
|
0 |
0 |
string override_name = "from_model_" + overrides[i]; |
|
21467
|
0 |
0 |
if (!option_int(tagger, override_name, override_value, error, model_index)) return false; |
|
|
0 |
0 |
if (!option_int(tagger, override_name, override_value, error, model_index)) return false; |
|
21468
|
0 |
0 |
if (override_value >= 0) |
|
21474
|
0 |
0 |
model_name = "from_model_" + to_string(1 + ++model_index); |
|
21476
|
0 |
0 |
if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
|
0 |
0 |
if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
21479
|
0 |
0 |
os.put(taggers_total); |
|
21480
|
0 |
0 |
for (auto&& tagger_data : taggers_data) |
|
21481
|
0 |
0 |
os.write(tagger_data.str + 1, tagger_data.len - 1); |
|
21484
|
0 |
0 |
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
|
|
0 |
0 |
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
|
|
0 |
0 |
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
|
21485
|
0 |
0 |
if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false; |
|
|
0 |
0 |
if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false; |
|
21486
|
0 |
0 |
if (models > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
|
0 |
0 |
if (models > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
21488
|
0 |
0 |
os.put(models); |
|
21489
|
0 |
0 |
for (int model = 0; model < models; model++) |
|
21490
|
0 |
0 |
if (!train_tagger_model(training, heldout, model, models, tagger, os, error)) |
|
|
0 |
0 |
if (!train_tagger_model(training, heldout, model, models, tagger, os, error)) |
|
21500
|
0 |
0 |
if (options == NONE) { |
|
21505
|
0 |
0 |
if (!named_values::parse(options, parser, error)) return false; |
|
|
0 |
0 |
if (!named_values::parse(options, parser, error)) return false; |
|
21506
|
0 |
0 |
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
|
|
0 |
0 |
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
|
|
0 |
0 |
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
|
21508
|
0 |
0 |
if (parser.count("from_model")) { |
|
|
0 |
0 |
if (parser.count("from_model")) { |
|
21511
|
0 |
0 |
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
|
|
0 |
0 |
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
|
|
0 |
0 |
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
|
21512
|
0 |
0 |
return error.assign("Cannot load model from which the parser should be used!"), false; |
|
21515
|
0 |
0 |
os.write(parser_data.str, parser_data.len); |
|
21517
|
0 |
0 |
os.put(1); |
|
21520
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
21521
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
21524
|
0 |
0 |
"static"; |
|
|
0 |
0 |
"static"; |
|
|
0 |
0 |
"static"; |
|
|
0 |
0 |
"static"; |
|
21526
|
0 |
0 |
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
|
|
0 |
0 |
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
|
|
0 |
0 |
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
|
21527
|
0 |
0 |
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
|
|
0 |
0 |
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
|
|
0 |
0 |
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
|
21528
|
0 |
0 |
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
|
|
0 |
0 |
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
|
|
0 |
0 |
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
|
21529
|
0 |
0 |
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
|
|
0 |
0 |
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
|
|
0 |
0 |
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
|
21530
|
0 |
0 |
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
|
|
0 |
0 |
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
|
|
0 |
0 |
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
|
21531
|
0 |
0 |
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
|
|
0 |
0 |
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
|
|
0 |
0 |
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
|
21532
|
0 |
0 |
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
|
|
0 |
0 |
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
|
|
0 |
0 |
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
|
21533
|
0 |
0 |
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
|
|
0 |
0 |
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
|
|
0 |
0 |
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
|
21535
|
0 |
0 |
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
|
21536
|
0 |
0 |
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
|
21537
|
0 |
0 |
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
|
21538
|
0 |
0 |
if (embedding_form) { |
|
21539
|
0 |
0 |
embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount)); |
|
|
0 |
0 |
embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount)); |
|
21540
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
21541
|
0 |
0 |
embeddings.push_back('\n'); |
|
21543
|
0 |
0 |
if (embedding_lemma) { |
|
21544
|
0 |
0 |
embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount)); |
|
|
0 |
0 |
embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount)); |
|
21545
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
21546
|
0 |
0 |
embeddings.push_back('\n'); |
|
21548
|
0 |
0 |
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
|
|
0 |
0 |
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
|
21550
|
0 |
0 |
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
|
|
0 |
0 |
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
|
|
0 |
0 |
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
|
21551
|
0 |
0 |
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
|
|
0 |
0 |
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
|
|
0 |
0 |
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
|
21552
|
0 |
0 |
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
|
|
0 |
0 |
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
|
|
0 |
0 |
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
|
21553
|
0 |
0 |
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
|
|
0 |
0 |
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
|
|
0 |
0 |
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
|
21554
|
0 |
0 |
int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2); |
|
|
0 |
0 |
int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2); |
|
21555
|
0 |
0 |
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
|
|
0 |
0 |
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
|
|
0 |
0 |
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
|
21556
|
0 |
0 |
double learning_rate = run <= 1 ? 0.02 : hyperparameter_logarithmic(run, 2, 0.005, 0.04); |
|
21557
|
0 |
0 |
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
|
|
0 |
0 |
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
|
|
0 |
0 |
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
|
21558
|
0 |
0 |
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
0 |
0 |
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
|
|
0 |
0 |
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
|
21559
|
0 |
0 |
double l2 = run <= 1 ? 0.5 : hyperparameter_uniform(run, 3, 0.2, 0.6); |
|
21560
|
0 |
0 |
if (!option_double(parser, "l2", l2, error)) return false; |
|
|
0 |
0 |
if (!option_double(parser, "l2", l2, error)) return false; |
|
|
0 |
0 |
if (!option_double(parser, "l2", l2, error)) return false; |
|
21561
|
0 |
0 |
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
|
|
0 |
0 |
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
|
|
0 |
0 |
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
|
21563
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
|
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
|
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
|
21589
|
0 |
0 |
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
|
|
0 |
0 |
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
|
|
0 |
0 |
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
|
21590
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
21591
|
0 |
0 |
stringstream tagger_description; |
|
21592
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
21593
|
0 |
0 |
tagger.reset(model_morphodita_parsito::load(tagger_description)); |
|
21594
|
0 |
0 |
if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false; |
|
|
0 |
0 |
if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false; |
|
21598
|
0 |
0 |
sentence tagged; |
|
21600
|
0 |
0 |
for (auto&& sentence : training) { |
|
21601
|
0 |
0 |
tagged = sentence; |
|
21602
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
21604
|
0 |
0 |
train_trees.emplace_back(); |
|
21605
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) { |
|
21607
|
0 |
0 |
model_normalize_form(tagged.words[i].form, train_trees.back().nodes.back().form); |
|
21613
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) |
|
21614
|
0 |
0 |
train_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
|
21619
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
21620
|
0 |
0 |
tagged = sentence; |
|
21621
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
21623
|
0 |
0 |
heldout_trees.emplace_back(); |
|
21624
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) { |
|
21626
|
0 |
0 |
model_normalize_form(tagged.words[i].form, heldout_trees.back().nodes.back().form); |
|
21632
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) |
|
21633
|
0 |
0 |
heldout_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
|
21637
|
0 |
0 |
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
|
|
0 |
0 |
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
|
|
0 |
0 |
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
|
21638
|
0 |
0 |
<< "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl |
|
|
0 |
0 |
<< "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl |
|
21639
|
0 |
0 |
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
|
|
0 |
0 |
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
|
|
0 |
0 |
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
|
21640
|
0 |
0 |
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
|
|
0 |
0 |
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
|
|
0 |
0 |
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
|
21641
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
21642
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
21643
|
0 |
0 |
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
|
|
0 |
0 |
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
|
|
0 |
0 |
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
|
21645
|
0 |
0 |
<< ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
|
0 |
0 |
<< ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
21648
|
0 |
0 |
binary_encoder enc; |
|
21649
|
0 |
0 |
enc.add_str("nn_versioned"); |
|
21651
|
0 |
0 |
parameters, 1, train_trees, heldout_trees, enc); |
|
21652
|
0 |
0 |
compressor::save(os, enc); |
|
21664
|
0 |
0 |
if (!is.get(len)) return false; |
|
|
0 |
0 |
if (!is.get(len)) return false; |
|
21666
|
0 |
0 |
if (!is.read(&name[0], len)) return false; |
|
|
0 |
0 |
if (!is.read(&name[0], len)) return false; |
|
21667
|
0 |
0 |
if (name != "morphodita_parsito") return false; |
|
21670
|
0 |
0 |
if (!is.get(version)) return false; |
|
|
0 |
0 |
if (!is.get(version)) return false; |
|
21671
|
0 |
0 |
if (!(version >= 1 && version <= model_morphodita_parsito::VERSION_LATEST)) return false; |
|
21676
|
0 |
0 |
if (version >= 2) { |
|
21678
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
21679
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
21684
|
0 |
0 |
if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg(); |
|
|
0 |
0 |
if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg(); |
|
21685
|
0 |
0 |
char tokenizer; if (!is.get(tokenizer)) return false; |
|
|
0 |
0 |
char tokenizer; if (!is.get(tokenizer)) return false; |
|
21686
|
0 |
0 |
unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
|
0 |
0 |
unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
21687
|
0 |
0 |
if (tokenizer && !tokenizer_factory) return false; |
|
|
0 |
0 |
if (tokenizer && !tokenizer_factory) return false; |
|
|
0 |
0 |
if (tokenizer && !tokenizer_factory) return false; |
|
21688
|
0 |
0 |
unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
|
0 |
0 |
unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
21689
|
0 |
0 |
if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
0 |
0 |
if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
21694
|
0 |
0 |
if (model == TAGGER_MODEL) range.str = data.data() + is.tellg(); |
|
|
0 |
0 |
if (model == TAGGER_MODEL) range.str = data.data() + is.tellg(); |
|
21695
|
0 |
0 |
char taggers; if (!is.get(taggers)) return false; |
|
|
0 |
0 |
char taggers; if (!is.get(taggers)) return false; |
|
21696
|
0 |
0 |
for (char i = 0; i < taggers; i++) { |
|
21697
|
0 |
0 |
char lemma; if (!is.get(lemma)) return false; |
|
|
0 |
0 |
char lemma; if (!is.get(lemma)) return false; |
|
21698
|
0 |
0 |
char xpostag; if (!is.get(xpostag)) return false; |
|
|
0 |
0 |
char xpostag; if (!is.get(xpostag)) return false; |
|
21699
|
0 |
0 |
char feats; if (!is.get(feats)) return false; |
|
|
0 |
0 |
char feats; if (!is.get(feats)) return false; |
|
21700
|
0 |
0 |
unique_ptr tagger(morphodita::tagger::load(is)); |
|
21701
|
0 |
0 |
if (!tagger) return false; |
|
21703
|
0 |
0 |
if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
0 |
0 |
if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
21708
|
0 |
0 |
if (model == PARSER_MODEL) range.str = data.data() + is.tellg(); |
|
|
0 |
0 |
if (model == PARSER_MODEL) range.str = data.data() + is.tellg(); |
|
21710
|
0 |
0 |
if (!is.get(parser)) return false; |
|
|
0 |
0 |
if (!is.get(parser)) return false; |
|
21711
|
0 |
0 |
unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr); |
|
|
0 |
0 |
unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr); |
|
21712
|
0 |
0 |
if (parser && !parser_model) return false; |
|
|
0 |
0 |
if (parser && !parser_model) return false; |
|
|
0 |
0 |
if (parser && !parser_model) return false; |
|
21713
|
0 |
0 |
if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
|
0 |
0 |
if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
21720
|
0 |
0 |
return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_form(form, output); |
|
21724
|
0 |
0 |
return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_lemma(lemma, output); |
|
21728
|
0 |
0 |
model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).fill_word_analysis(analysis, false, upostag, lemma, xpostag, feats, word); |
|
21736
|
0 |
0 |
unique_ptr conllu_input_format(input_format::new_conllu_input_format()); |
|
21738
|
0 |
0 |
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
|
|
0 |
0 |
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
|
|
0 |
0 |
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
|
21741
|
0 |
0 |
for (auto&& sentence : training) |
|
21742
|
0 |
0 |
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
|
|
0 |
0 |
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
|
|
0 |
0 |
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
|
21743
|
0 |
0 |
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
|
|
0 |
0 |
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
|
|
0 |
0 |
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
|
21745
|
0 |
0 |
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
|
|
0 |
0 |
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
|
|
0 |
0 |
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
|
21746
|
0 |
0 |
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
|
|
0 |
0 |
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
|
|
0 |
0 |
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
|
21747
|
0 |
0 |
int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0; |
|
|
0 |
0 |
int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0; |
|
21748
|
0 |
0 |
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
|
|
0 |
0 |
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
|
|
0 |
0 |
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
|
21749
|
0 |
0 |
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
|
|
0 |
0 |
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
|
|
0 |
0 |
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
|
21751
|
0 |
0 |
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
|
|
0 |
0 |
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
|
|
0 |
0 |
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
|
21752
|
0 |
0 |
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
|
|
0 |
0 |
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
|
|
0 |
0 |
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
|
21753
|
0 |
0 |
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
|
|
0 |
0 |
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
|
|
0 |
0 |
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
|
21754
|
0 |
0 |
os.put(char(provide_lemma ? use_lemma : 0)); |
|
|
0 |
0 |
os.put(char(provide_lemma ? use_lemma : 0)); |
|
21755
|
0 |
0 |
os.put(char(provide_xpostag && use_xpostag)); |
|
|
0 |
0 |
os.put(char(provide_xpostag && use_xpostag)); |
|
|
0 |
0 |
os.put(char(provide_xpostag && use_xpostag)); |
|
21756
|
0 |
0 |
os.put(char(provide_feats && use_feats)); |
|
|
0 |
0 |
os.put(char(provide_feats && use_feats)); |
|
|
0 |
0 |
os.put(char(provide_feats && use_feats)); |
|
21758
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
21759
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
21760
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
21763
|
0 |
0 |
stringstream morpho_description; |
|
21767
|
0 |
0 |
const string& dictionary_model = option_str(tagger, "dictionary_model", model); |
|
|
0 |
0 |
const string& dictionary_model = option_str(tagger, "dictionary_model", model); |
|
21768
|
0 |
0 |
if (!dictionary_model.empty()) { |
|
21777
|
0 |
0 |
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
|
|
0 |
0 |
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
|
|
0 |
0 |
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
|
21779
|
0 |
0 |
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
|
|
0 |
0 |
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
|
|
0 |
0 |
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
|
21781
|
0 |
0 |
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
|
|
0 |
0 |
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
|
|
0 |
0 |
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
|
21782
|
0 |
0 |
for (auto&& lemma : lemmas) { |
|
21783
|
0 |
0 |
if (lemma.find('~') != string::npos) |
|
21784
|
0 |
0 |
return error.assign("Dictionary_flat_lemmas cannot contain '~' character!"), false; |
|
21788
|
0 |
0 |
flat_lemmas.insert("greek.expression"); |
|
21791
|
0 |
0 |
if (!option_str(tagger, "dictionary", model).empty()) |
|
|
0 |
0 |
if (!option_str(tagger, "dictionary", model).empty()) |
|
|
0 |
0 |
if (!option_str(tagger, "dictionary", model).empty()) |
|
21792
|
0 |
0 |
return error.assign("The tagger 'dictionary' option is no longer supported, use 'dictionary_file' instead!"), false; |
|
21793
|
0 |
0 |
const string& dictionary_file = option_str(tagger, "dictionary_file", model); |
|
|
0 |
0 |
const string& dictionary_file = option_str(tagger, "dictionary_file", model); |
|
21794
|
0 |
0 |
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
|
|
0 |
0 |
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
|
|
0 |
0 |
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
|
21796
|
0 |
0 |
cerr << "Tagger model " << model+1 << " dictionary options: " << "max_form_analyses=" << max_form_analyses |
|
21797
|
0 |
0 |
<< ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl; |
|
|
0 |
0 |
<< ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl; |
|
21800
|
0 |
0 |
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
|
|
0 |
0 |
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
|
|
0 |
0 |
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
|
21801
|
0 |
0 |
int guesser_suffix_rules = run <= 1 ? 8 : 5 + hyperparameter_integer(run, 1, 0, 7); |
|
21802
|
0 |
0 |
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
|
|
0 |
0 |
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
|
|
0 |
0 |
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
|
21803
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
21804
|
0 |
0 |
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
|
|
0 |
0 |
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
|
|
0 |
0 |
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
|
21805
|
0 |
0 |
int guesser_enrich_dictionary = run <= 1 ? 6 : 3 + hyperparameter_integer(run, 2, 0, 7); |
|
21806
|
0 |
0 |
if (!dictionary_file.empty()) guesser_enrich_dictionary = 0; |
|
21807
|
0 |
0 |
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
|
|
0 |
0 |
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
|
|
0 |
0 |
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
|
21809
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
|
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
|
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
|
21810
|
0 |
0 |
<< ", guesser_enrich_dictionary=" << guesser_enrich_dictionary << endl; |
|
21812
|
0 |
0 |
cerr << "Tagger model " << model+1 << " guesser options: " << "suffix_rules=" << guesser_suffix_rules |
|
21813
|
0 |
0 |
<< ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count |
|
|
0 |
0 |
<< ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count |
|
21814
|
0 |
0 |
<< ", enrich_dictionary=" << guesser_enrich_dictionary << endl; |
|
21817
|
0 |
0 |
stringstream guesser_description; |
|
21819
|
0 |
0 |
stringstream guesser_input; |
|
21820
|
0 |
0 |
for (auto&& sentence : training) { |
|
21821
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
|
21822
|
0 |
0 |
guesser_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
|
21823
|
0 |
0 |
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas) << '\t' |
|
21824
|
0 |
0 |
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
|
21827
|
0 |
0 |
morphodita::morpho_statistical_guesser_trainer::train(guesser_input, guesser_suffix_len, guesser_suffix_rules, guesser_prefixes_max, guesser_prefix_min_count, guesser_description); |
|
21835
|
0 |
0 |
for (auto&& sentence : training) |
|
21836
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) { |
|
21837
|
0 |
0 |
model_normalize_form(sentence.words[i].form, normalized_form); |
|
21838
|
0 |
0 |
entry.assign(combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas)) |
|
21839
|
0 |
0 |
.append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag)) |
|
|
0 |
0 |
.append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag)) |
|
21840
|
0 |
0 |
.append("\t").append(normalized_form); |
|
21845
|
0 |
0 |
for (auto&& form_analyses : entries) { |
|
21847
|
0 |
0 |
for (auto&& analysis : form_analyses.second) |
|
21848
|
0 |
0 |
analyses.emplace_back(analysis.second, analysis.first); |
|
21849
|
0 |
0 |
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
|
|
0 |
0 |
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
|
|
0 |
0 |
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
|
21851
|
0 |
0 |
analyses.resize(max_form_analyses); |
|
21853
|
0 |
0 |
for (auto&& analysis : analyses) |
|
21859
|
0 |
0 |
dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag); |
|
|
0 |
0 |
dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag); |
|
21860
|
0 |
0 |
dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag); |
|
|
0 |
0 |
dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag); |
|
21861
|
0 |
0 |
dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag); |
|
|
0 |
0 |
dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag); |
|
21864
|
0 |
0 |
if (!dictionary_file.empty()) { |
|
21865
|
0 |
0 |
ifstream is(path_from_utf8(dictionary_file).c_str()); |
|
21866
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
|
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
|
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
|
21869
|
0 |
0 |
word entry; |
|
21871
|
0 |
0 |
while (getline(is, line)) { |
|
|
0 |
0 |
while (getline(is, line)) { |
|
21873
|
0 |
0 |
if (line.empty()) continue; |
|
21875
|
0 |
0 |
split(line, '\t', dictionary_parts); |
|
21877
|
0 |
0 |
if (dictionary_parts.size() != 5) |
|
21878
|
0 |
0 |
return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false; |
|
|
0 |
0 |
return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false; |
|
21880
|
0 |
0 |
model_normalize_form(dictionary_parts[0], entry.form); |
|
21881
|
0 |
0 |
entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len); |
|
|
0 |
0 |
entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len); |
|
21882
|
0 |
0 |
entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len); |
|
|
0 |
0 |
entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len); |
|
21883
|
0 |
0 |
entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len); |
|
|
0 |
0 |
entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len); |
|
21884
|
0 |
0 |
entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len); |
|
|
0 |
0 |
entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len); |
|
21886
|
0 |
0 |
entry_encoded.assign(combine_lemma(entry, use_lemma, combined_lemma, flat_lemmas)) |
|
21887
|
0 |
0 |
.append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag)) |
|
|
0 |
0 |
.append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag)) |
|
21888
|
0 |
0 |
.append("\t").append(entry.form); |
|
21894
|
0 |
0 |
if (guesser_enrich_dictionary) { |
|
21896
|
0 |
0 |
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
|
|
0 |
0 |
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
|
|
0 |
0 |
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
|
21897
|
0 |
0 |
guesser_only_morphology.put(morphodita::morpho_ids::GENERIC); |
|
21898
|
0 |
0 |
morphodita::generic_morpho_encoder::encode(empty_data, dictionary_suffix_len, dictionary_special_tags, guesser_description_copy, guesser_only_morphology); |
|
21900
|
0 |
0 |
unique_ptr guesser_only_morpho(morphodita::morpho::load(guesser_only_morphology)); |
|
21901
|
0 |
0 |
if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false; |
|
|
0 |
0 |
if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false; |
|
21906
|
0 |
0 |
for (auto&& sentence : training) |
|
21907
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) { |
|
21908
|
0 |
0 |
const auto& form = model_normalize_form(sentence.words[i].form, normalized_form); |
|
21909
|
0 |
0 |
if (!analyzed_forms.count(form)) { |
|
21910
|
0 |
0 |
guesser_only_morpho->analyze(form, morphodita::morpho::GUESSER, analyses); |
|
21913
|
0 |
0 |
for (auto&& analyse : analyses) { |
|
21914
|
0 |
0 |
entry.assign(analyse.lemma).push_back('\t'); |
|
21915
|
0 |
0 |
entry.append(analyse.tag).push_back('\t'); |
|
21917
|
0 |
0 |
if (dictionary_entries.insert(entry).second) |
|
21918
|
0 |
0 |
if (!--to_add) |
|
21927
|
0 |
0 |
vector sorted_dictionary(dictionary_entries.begin(), dictionary_entries.end()); |
|
21930
|
0 |
0 |
stringstream morpho_input; |
|
21931
|
0 |
0 |
for (auto&& entry : sorted_dictionary) |
|
21934
|
0 |
0 |
morpho_description.put(morphodita::morpho_ids::GENERIC); |
|
21935
|
0 |
0 |
morphodita::generic_morpho_encoder::encode(morpho_input, dictionary_suffix_len, dictionary_special_tags, guesser_description, morpho_description); |
|
21939
|
0 |
0 |
const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model); |
|
|
0 |
0 |
const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model); |
|
21940
|
0 |
0 |
if (!dictionary_accuracy.empty()) { |
|
21941
|
0 |
0 |
unique_ptr morpho(morphodita::morpho::load(morpho_description)); |
|
21942
|
0 |
0 |
if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false; |
|
|
0 |
0 |
if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false; |
|
21943
|
0 |
0 |
morpho_description.seekg(0, ios::beg); |
|
21948
|
0 |
0 |
word w; |
|
21950
|
0 |
0 |
conllu_input_format->set_text(dictionary_accuracy.c_str()); |
|
21951
|
0 |
0 |
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
|
|
0 |
0 |
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
|
|
0 |
0 |
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
|
21952
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) { |
|
21953
|
0 |
0 |
morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses); |
|
|
0 |
0 |
morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses); |
|
21955
|
0 |
0 |
for (auto&& analysis : analyses) { |
|
21956
|
0 |
0 |
w.lemma.assign("_"); |
|
21957
|
0 |
0 |
model_fill_word_analysis(analysis, true, use_lemma, true, true, w); |
|
21961
|
0 |
0 |
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
|
|
0 |
0 |
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
|
|
0 |
0 |
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
|
21972
|
0 |
0 |
if (!error.empty()) return false; |
|
21981
|
0 |
0 |
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
|
|
0 |
0 |
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
|
|
0 |
0 |
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
|
21983
|
0 |
0 |
if (tagger_order == 2) tagger_id = morphodita::tagger_ids::CONLLU2; |
|
21984
|
0 |
0 |
else if (tagger_order == 2.5) tagger_id = morphodita::tagger_ids::CONLLU2_3; |
|
21985
|
0 |
0 |
else if (tagger_order == 3) tagger_id = morphodita::tagger_ids::CONLLU3; |
|
21986
|
0 |
0 |
else return error.assign("The tagger_order can be only 2, 2.5 or 3!"), false; |
|
21988
|
0 |
0 |
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
|
|
0 |
0 |
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
|
|
0 |
0 |
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
|
21989
|
0 |
0 |
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
|
|
0 |
0 |
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
|
|
0 |
0 |
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
|
21990
|
0 |
0 |
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
|
|
0 |
0 |
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
|
|
0 |
0 |
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
|
21992
|
0 |
0 |
option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger : |
|
|
0 |
0 |
option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger : |
|
21993
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
21994
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
21995
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
21996
|
0 |
0 |
if (heldout.empty()) tagger_early_stopping = false; |
|
21998
|
0 |
0 |
cerr << "Tagger model " << model+1 << " options: iterations=" << tagger_iterations |
|
21999
|
0 |
0 |
<< ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates=" |
|
|
0 |
0 |
<< ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates=" |
|
22001
|
0 |
0 |
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
|
|
0 |
0 |
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
|
|
0 |
0 |
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
|
22005
|
0 |
0 |
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
|
|
0 |
0 |
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
|
|
0 |
0 |
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
|
22006
|
0 |
0 |
for (auto&& sentence : training) { |
|
22007
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
|
22008
|
0 |
0 |
input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
|
22009
|
0 |
0 |
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
|
22010
|
0 |
0 |
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
|
22014
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
22015
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
|
22016
|
0 |
0 |
heldout_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
|
22017
|
0 |
0 |
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
|
22018
|
0 |
0 |
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
|
22022
|
0 |
0 |
os.put(tagger_id); |
|
22023
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
22032
|
0 |
0 |
while (separator < tag_separators.size() && |
|
22033
|
0 |
0 |
(w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos)) |
|
22036
|
0 |
0 |
if (separator >= tag_separators.size()) { |
|
22045
|
0 |
0 |
while (separator < tag_separators.size() && |
|
22046
|
0 |
0 |
(w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos)) |
|
22048
|
0 |
0 |
if (separator >= tag_separators.size()) |
|
22054
|
0 |
0 |
if (xpostag || feats) { |
|
22056
|
0 |
0 |
if (xpostag) combined_tag.append(w.xpostag); |
|
22057
|
0 |
0 |
if (feats) combined_tag.push_back(tag_separators[separator]); |
|
22058
|
0 |
0 |
if (feats) combined_tag.append(w.feats); |
|
22067
|
0 |
0 |
for (auto&& sentence : data) |
|
22068
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
|
22069
|
0 |
0 |
if (sentence.words[i].upostag == upostag) |
|
22070
|
0 |
0 |
counts[combine_tag(sentence.words[i], xpostag, feats, combined_tag)]++; |
|
22072
|
0 |
0 |
combined_tag.assign("~").append(upostag); |
|
22074
|
0 |
0 |
for (auto&& tags : counts) |
|
22075
|
0 |
0 |
if (tags.second > best) { |
|
22088
|
0 |
0 |
if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma)) |
|
22092
|
0 |
0 |
if (w.lemma == "") |
|
22094
|
0 |
0 |
else if (w.lemma == "_") |
|
22098
|
0 |
0 |
if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma)) { |
|
22100
|
0 |
0 |
model_normalize_form(w.form, normalized_form); |
|
22101
|
0 |
0 |
return combined_lemma.insert(0, "~").append("~").append(normalized_form); |
|
|
0 |
0 |
return combined_lemma.insert(0, "~").append("~").append(normalized_form); |
|
22111
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
22118
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
22120
|
0 |
0 |
if (options.count(indexed_name)) |
|
22121
|
0 |
0 |
return parse_int(options.at(indexed_name), name.c_str(), value, error); |
|
22122
|
0 |
0 |
if (options.count(name)) |
|
22123
|
0 |
0 |
return parse_int(options.at(name), name.c_str(), value, error); |
|
22129
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
22131
|
0 |
0 |
if (options.count(indexed_name) || options.count(name)) { |
|
22133
|
0 |
0 |
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
|
|
0 |
0 |
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
|
|
0 |
0 |
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
|
22142
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
22144
|
0 |
0 |
if (options.count(indexed_name)) |
|
22145
|
0 |
0 |
return parse_double(options.at(indexed_name), name.c_str(), value, error); |
|
22146
|
0 |
0 |
if (options.count(name)) |
|
22147
|
0 |
0 |
return parse_double(options.at(name), name.c_str(), value, error); |
|
22307
|
0 |
0 |
training_error::training_error() : runtime_error(message_collector.str()) { |
|
22580
|
0 |
0 |
decompose(str, true); |
|
|
0 |
0 |
decompose(str, true); |
|
|
0 |
0 |
decompose(str, true); |
|
22585
|
0 |
0 |
for (old = 0, com = 0; old < str.size(); old++, com++) { |
|
22587
|
0 |
0 |
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
|
|
0 |
0 |
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
|
|
0 |
0 |
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
|
22589
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
22592
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
22595
|
0 |
0 |
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
0 |
0 |
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
0 |
0 |
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
22597
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
22599
|
0 |
0 |
} else if (str[old] < CHARS) { |
|
22603
|
0 |
0 |
for (int last_ccc = -1; old + 1 < str.size(); old++) { |
|
22604
|
0 |
0 |
int ccc = str[old + 1] < CHARS ? ccc_block[ccc_index[str[old + 1] >> 8]][str[old + 1] & 0xFF] : 0; |
|
22605
|
0 |
0 |
if (composition[1] - composition[0] && last_ccc < ccc) { |
|
|
0 |
0 |
if (composition[1] - composition[0] && last_ccc < ccc) { |
|
22608
|
0 |
0 |
while (l + 2 < r) { |
|
22610
|
0 |
0 |
if (composition_data[m] <= str[old + 1]) l = m; |
|
22611
|
0 |
0 |
if (composition_data[m] >= str[old + 1]) r = m; |
|
22613
|
0 |
0 |
if (composition_data[l] == str[old + 1]) { |
|
22621
|
0 |
0 |
if (!ccc) break; |
|
22628
|
0 |
0 |
if (com < old) str.resize(com); |
|
22635
|
0 |
0 |
for (auto&& chr : str) { |
|
22638
|
0 |
0 |
if (chr >= Hangul::SBase && chr < Hangul::SBase + Hangul::SCount) { |
|
22640
|
0 |
0 |
decomposition_len = 2 + ((chr - Hangul::SBase) % Hangul::TCount ? 1 : 0); |
|
22641
|
0 |
0 |
} else if (chr < CHARS) { |
|
22645
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
22646
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) |
|
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) |
|
22648
|
0 |
0 |
for (auto i = decomposition[0] >> 2; i < decomposition[1] >> 2; i++) { |
|
22650
|
0 |
0 |
if (further_decomposition[0] & 1) decomposition_len += (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2) - 1; |
|
22654
|
0 |
0 |
if (!decomposition_len) continue; |
|
22660
|
0 |
0 |
if (any_decomposition) { |
|
22662
|
0 |
0 |
for (size_t dec = str.size(), old = dec - additional; old--; ) |
|
22663
|
0 |
0 |
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
0 |
0 |
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
|
0 |
0 |
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
22666
|
0 |
0 |
if (s_index % Hangul::TCount) str[--dec] = Hangul::TBase + s_index % Hangul::TCount; |
|
22669
|
0 |
0 |
} else if (str[old] < CHARS) { |
|
22673
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
22674
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) { |
|
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) { |
|
22676
|
0 |
0 |
while (decomposition_len--) { |
|
22679
|
0 |
0 |
if (further_decomposition[0] & 1) { |
|
22680
|
0 |
0 |
for (int further_decomposition_len = (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2); further_decomposition_len--; ) |
|
22686
|
0 |
0 |
} else if (decomposition_len) { |
|
22688
|
0 |
0 |
while (decomposition_len--) |
|
22701
|
0 |
0 |
for (size_t i = 1; i < str.size(); i++) { |
|
22702
|
0 |
0 |
unsigned ccc = str[i] < CHARS ? ccc_block[ccc_index[str[i] >> 8]][str[i] & 0xFF] : 0; |
|
22703
|
0 |
0 |
if (!ccc) continue; |
|
22707
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
22963
|
0 |
0 |
for (; *str; str++) |
|
22964
|
0 |
0 |
if (((unsigned char)*str) >= 0x80) { |
|
22965
|
0 |
0 |
if (((unsigned char)*str) < 0xC0) return false; |
|
22966
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
|
22967
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22968
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
|
22969
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22970
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22971
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
|
22972
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22973
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22974
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22981
|
0 |
0 |
for (; len > 0; str++, len--) |
|
22982
|
0 |
0 |
if (((unsigned char)*str) >= 0x80) { |
|
22983
|
0 |
0 |
if (((unsigned char)*str) < 0xC0) return false; |
|
22984
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
|
22985
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22986
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
|
22987
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22988
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22989
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
|
22990
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22991
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
22992
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
23001
|
0 |
0 |
for (char32_t chr; (chr = decode(str)); ) |
|
23008
|
0 |
0 |
while (len) |
|
23015
|
0 |
0 |
for (auto&& chr : str) |
|
23043
|
0 |
0 |
return {3, 3, 0, ""}; |
|
|
0 |
0 |
return {3, 3, 0, ""}; |
|
|
0 |
0 |
return {3, 3, 0, ""}; |
|
23573
|
3092 |
104350 |
IF_BIT_0(prob) |
|
|
23097 |
84345 |
IF_BIT_0(prob) |
|
23578
|
23091 |
6 |
if (checkDicSize != 0 || processedPos != 0) |
|
23580
|
0 |
23091 |
(dic[(dicPos == 0 ? dicBufSize : dicPos) - 1] >> (8 - lc)))); |
|
23582
|
21934 |
1163 |
if (state < kNumLitStates) |
|
23586
|
18121 |
157351 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
81155 |
94317 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
153538 |
21934 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
23590
|
0 |
1163 |
unsigned matchByte = p->dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
|
23592
|
166 |
997 |
state -= (state < 10) ? 3 : 6; |
|
23601
|
1028 |
8276 |
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
|
|
5355 |
3949 |
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
|
23603
|
8141 |
1163 |
while (symbol < 0x100); |
|
23613
|
572 |
83773 |
IF_BIT_0(prob) |
|
|
487 |
83858 |
IF_BIT_0(prob) |
|
23622
|
83858 |
0 |
if (checkDicSize == 0 && processedPos == 0) |
|
23625
|
280 |
83578 |
IF_BIT_0(prob) |
|
|
83695 |
163 |
IF_BIT_0(prob) |
|
23629
|
283 |
83412 |
IF_BIT_0(prob) |
|
|
645 |
83050 |
IF_BIT_0(prob) |
|
23632
|
0 |
645 |
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
|
23635
|
3 |
642 |
state = state < kNumLitStates ? 9 : 11; |
|
23645
|
49 |
114 |
IF_BIT_0(prob) |
|
|
97 |
66 |
IF_BIT_0(prob) |
|
23654
|
9 |
57 |
IF_BIT_0(prob) |
|
|
37 |
29 |
IF_BIT_0(prob) |
|
23670
|
82945 |
268 |
state = state < kNumLitStates ? 8 : 11; |
|
23676
|
462 |
83238 |
IF_BIT_0(probLen) |
|
|
445 |
83255 |
IF_BIT_0(probLen) |
|
23687
|
274 |
82981 |
IF_BIT_0(probLen) |
|
|
113 |
83142 |
IF_BIT_0(probLen) |
|
23702
|
2505 |
664305 |
TREE_DECODE(probLen, limit, len); |
|
|
2240 |
664570 |
TREE_DECODE(probLen, limit, len); |
|
|
583110 |
83700 |
TREE_DECODE(probLen, limit, len); |
|
23706
|
487 |
83213 |
if (state >= kNumStates) |
|
23711
|
55 |
432 |
TREE_6_DECODE(prob, distance); |
|
|
325 |
162 |
TREE_6_DECODE(prob, distance); |
|
|
55 |
432 |
TREE_6_DECODE(prob, distance); |
|
|
429 |
58 |
TREE_6_DECODE(prob, distance); |
|
|
48 |
439 |
TREE_6_DECODE(prob, distance); |
|
|
303 |
184 |
TREE_6_DECODE(prob, distance); |
|
|
65 |
422 |
TREE_6_DECODE(prob, distance); |
|
|
254 |
233 |
TREE_6_DECODE(prob, distance); |
|
|
58 |
429 |
TREE_6_DECODE(prob, distance); |
|
|
260 |
227 |
TREE_6_DECODE(prob, distance); |
|
|
56 |
431 |
TREE_6_DECODE(prob, distance); |
|
|
272 |
215 |
TREE_6_DECODE(prob, distance); |
|
23712
|
405 |
82 |
if (distance >= kStartPosModelIndex) |
|
23717
|
166 |
239 |
if (posSlot < kEndPosModelIndex) |
|
23724
|
325 |
166 |
do |
|
23726
|
54 |
437 |
GET_BIT2(prob + i, i, ; , distance |= mask); |
|
|
239 |
252 |
GET_BIT2(prob + i, i, ; , distance |= mask); |
|
23735
|
2333 |
239 |
do |
|
23737
|
309 |
2263 |
NORMALIZE |
|
23761
|
29 |
210 |
GET_BIT2(prob + i, i, ; , distance |= 1); |
|
|
124 |
115 |
GET_BIT2(prob + i, i, ; , distance |= 1); |
|
23762
|
37 |
202 |
GET_BIT2(prob + i, i, ; , distance |= 2); |
|
|
118 |
121 |
GET_BIT2(prob + i, i, ; , distance |= 2); |
|
23763
|
32 |
207 |
GET_BIT2(prob + i, i, ; , distance |= 4); |
|
|
130 |
109 |
GET_BIT2(prob + i, i, ; , distance |= 4); |
|
23764
|
26 |
213 |
GET_BIT2(prob + i, i, ; , distance |= 8); |
|
|
126 |
113 |
GET_BIT2(prob + i, i, ; , distance |= 8); |
|
23766
|
0 |
239 |
if (distance == (uint32_t)0xFFFFFFFF) |
|
23778
|
487 |
0 |
if (checkDicSize == 0) |
|
23780
|
487 |
0 |
if (distance >= processedPos) |
|
23783
|
0 |
0 |
else if (distance >= checkDicSize) |
|
23785
|
229 |
258 |
state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; |
|
23790
|
83700 |
0 |
if (limit == dicPos) |
|
23794
|
0 |
83700 |
unsigned curLen = ((rem < len) ? (unsigned)rem : len); |
|
23795
|
0 |
83700 |
size_t pos = (dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0); |
|
23800
|
83700 |
0 |
if (pos + curLen <= dicBufSize) |
|
23806
|
22566528 |
83700 |
do |
|
23812
|
0 |
0 |
do |
|
23815
|
0 |
0 |
if (++pos == dicBufSize) |
|
23823
|
106938 |
504 |
while (dicPos < limit && buf < bufLimit); |
|
23824
|
9 |
495 |
NORMALIZE; |
|
23842
|
0 |
510 |
if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) |
|
23849
|
0 |
0 |
if (limit - dicPos < len) |
|
23852
|
0 |
0 |
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
|
|
0 |
0 |
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
|
23857
|
0 |
0 |
while (len-- != 0) |
|
23859
|
0 |
0 |
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
|
23871
|
504 |
0 |
if (p->checkDicSize == 0) |
|
23874
|
0 |
504 |
if (limit - p->dicPos > rem) |
|
23877
|
504 |
0 |
RINOK(LzmaDec_DecodeReal(p, limit2, bufLimit)); |
|
23878
|
0 |
504 |
if (p->processedPos >= p->prop.dicSize) |
|
23882
|
498 |
6 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
|
0 |
498 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
|
0 |
0 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
23884
|
0 |
504 |
if (p->remainLen > kMatchSpecLenStart) |
|
23915
|
0 |
470 |
IF_BIT_0_CHECK(prob) |
|
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
|
37 |
433 |
IF_BIT_0_CHECK(prob) |
|
23922
|
36 |
1 |
if (p->checkDicSize != 0 || p->processedPos != 0) |
|
23925
|
0 |
36 |
(p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); |
|
23927
|
27 |
10 |
if (state < kNumLitStates) |
|
23930
|
23 |
193 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
23 |
0 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
111 |
105 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
|
189 |
27 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
23935
|
0 |
10 |
((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)]; |
|
23945
|
10 |
70 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
|
10 |
0 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
|
56 |
24 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
23947
|
70 |
10 |
while (symbol < 0x100); |
|
23957
|
2 |
431 |
IF_BIT_0_CHECK(prob) |
|
|
2 |
0 |
IF_BIT_0_CHECK(prob) |
|
|
17 |
416 |
IF_BIT_0_CHECK(prob) |
|
23969
|
0 |
416 |
IF_BIT_0_CHECK(prob) |
|
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
|
415 |
1 |
IF_BIT_0_CHECK(prob) |
|
23973
|
1 |
414 |
IF_BIT_0_CHECK(prob) |
|
|
1 |
0 |
IF_BIT_0_CHECK(prob) |
|
|
0 |
415 |
IF_BIT_0_CHECK(prob) |
|
23976
|
0 |
0 |
NORMALIZE_CHECK; |
|
|
0 |
0 |
NORMALIZE_CHECK; |
|
23988
|
0 |
1 |
IF_BIT_0_CHECK(prob) |
|
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
|
1 |
0 |
IF_BIT_0_CHECK(prob) |
|
23996
|
0 |
1 |
IF_BIT_0_CHECK(prob) |
|
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
|
0 |
1 |
IF_BIT_0_CHECK(prob) |
|
24012
|
4 |
429 |
IF_BIT_0_CHECK(probLen) |
|
|
4 |
0 |
IF_BIT_0_CHECK(probLen) |
|
|
16 |
417 |
IF_BIT_0_CHECK(probLen) |
|
24023
|
2 |
415 |
IF_BIT_0_CHECK(probLen) |
|
|
2 |
0 |
IF_BIT_0_CHECK(probLen) |
|
|
2 |
415 |
IF_BIT_0_CHECK(probLen) |
|
24038
|
21 |
3353 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
|
21 |
0 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
|
56 |
3318 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
|
2941 |
433 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
24042
|
17 |
416 |
if (state < 4) |
|
24048
|
12 |
90 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
|
12 |
0 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
|
62 |
40 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
|
85 |
17 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
24049
|
14 |
3 |
if (posSlot >= kStartPosModelIndex) |
|
24055
|
9 |
5 |
if (posSlot < kEndPosModelIndex) |
|
24062
|
37 |
5 |
do |
|
24064
|
5 |
37 |
NORMALIZE_CHECK |
|
|
5 |
0 |
NORMALIZE_CHECK |
|
24075
|
33 |
14 |
do |
|
24077
|
6 |
41 |
GET_BIT_CHECK(prob + i, i); |
|
|
6 |
0 |
GET_BIT_CHECK(prob + i, i); |
|
|
27 |
20 |
GET_BIT_CHECK(prob + i, i); |
|
24085
|
9 |
461 |
NORMALIZE_CHECK; |
|
|
9 |
0 |
NORMALIZE_CHECK; |
|
24102
|
0 |
0 |
if (initDic) |
|
24108
|
0 |
0 |
if (initState) |
|
24123
|
47940 |
6 |
for (i = 0; i < numProbs; i++) |
|
24139
|
510 |
0 |
while (p->remainLen != kMatchSpecLenStart) |
|
24143
|
6 |
504 |
if (p->needFlush != 0) |
|
24145
|
36 |
0 |
for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) |
|
|
30 |
6 |
for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) |
|
24147
|
0 |
6 |
if (p->tempBufSize < RC_INIT_SIZE) |
|
24152
|
6 |
0 |
if (p->tempBuf[0] != 0) |
|
24160
|
6 |
504 |
if (p->dicPos >= dicLimit) |
|
24162
|
6 |
0 |
if (p->remainLen == 0 && p->code == 0) |
|
|
6 |
0 |
if (p->remainLen == 0 && p->code == 0) |
|
24167
|
0 |
0 |
if (finishMode == LZMA_FINISH_ANY) |
|
24172
|
0 |
0 |
if (p->remainLen != 0) |
|
24180
|
6 |
498 |
if (p->needInitState) |
|
24183
|
0 |
504 |
if (p->tempBufSize == 0) |
|
24187
|
470 |
34 |
if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
|
24190
|
0 |
470 |
if (dummyRes == DUMMY_ERROR) |
|
24198
|
0 |
470 |
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
|
24208
|
504 |
0 |
if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0) |
|
24218
|
0 |
0 |
while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) |
|
|
0 |
0 |
while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) |
|
24221
|
0 |
0 |
if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
|
24224
|
0 |
0 |
if (dummyRes == DUMMY_ERROR) |
|
24230
|
0 |
0 |
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
|
24237
|
0 |
0 |
if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0) |
|
24246
|
0 |
0 |
if (p->code == 0) |
|
24261
|
0 |
0 |
if (p->dicPos == p->dicBufSize) |
|
24264
|
0 |
0 |
if (outSize > p->dicBufSize - dicPos) |
|
24284
|
0 |
0 |
if (res != 0) |
|
24286
|
0 |
0 |
if (outSizeCur == 0 || outSize == 0) |
|
24314
|
6 |
0 |
if (size < LZMA_PROPS_SIZE) |
|
24319
|
0 |
6 |
if (dicSize < LZMA_DIC_MIN) |
|
24324
|
6 |
0 |
if (d >= (9 * 5 * 5)) |
|
24338
|
0 |
6 |
if (p->probs == 0 || numProbs != p->numProbs) |
|
|
0 |
0 |
if (p->probs == 0 || numProbs != p->numProbs) |
|
24343
|
6 |
0 |
if (p->probs == 0) |
|
24352
|
6 |
0 |
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
|
24353
|
6 |
0 |
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
|
24362
|
0 |
0 |
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
|
24363
|
0 |
0 |
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
|
24365
|
0 |
0 |
if (p->dic == 0 || dicBufSize != p->dicBufSize) |
|
|
0 |
0 |
if (p->dic == 0 || dicBufSize != p->dicBufSize) |
|
24369
|
0 |
0 |
if (p->dic == 0) |
|
24389
|
6 |
0 |
if (inSize < RC_INIT_SIZE) |
|
24394
|
6 |
0 |
if (res != 0) |
|
24404
|
6 |
0 |
if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT) |
|
|
0 |
6 |
if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT) |
|
24418
|
6 |
6 |
static void LzmaFree(void* /*p*/, void *address) { delete[] (char*) address; } |
|
24426
|
6 |
0 |
if (!is.read((char *) &uncompressed_len, sizeof(uncompressed_len))) return false; |
|
24427
|
6 |
0 |
if (!is.read((char *) &compressed_len, sizeof(compressed_len))) return false; |
|
24428
|
6 |
0 |
if (!is.read((char *) &poor_crc, sizeof(poor_crc))) return false; |
|
24429
|
6 |
0 |
if (poor_crc != uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890) return false; |
|
24430
|
6 |
0 |
if (!is.read((char *) props_encoded, sizeof(props_encoded))) return false; |
|
24433
|
6 |
0 |
if (!is.read((char *) compressed.data(), compressed_len)) return false; |
|
|
6 |
0 |
if (!is.read((char *) compressed.data(), compressed_len)) return false; |
|
24437
|
6 |
0 |
auto res = lzma::LzmaDecode(data.fill(uncompressed_len), &uncompressed_size, compressed.data(), &compressed_size, props_encoded, LZMA_PROPS_SIZE, lzma::LZMA_FINISH_ANY, &status, &lzmaAllocator); |
|
24438
|
6 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
|
6 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
|
6 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
24768
|
0 |
0 |
if (!p->directInput) |
|
24780
|
0 |
0 |
if (p->directInput) |
|
24785
|
0 |
0 |
if (p->bufferBase == 0 || p->blockSize != blockSize) |
|
|
0 |
0 |
if (p->bufferBase == 0 || p->blockSize != blockSize) |
|
24808
|
0 |
0 |
if (p->streamEndWasReached || p->result != SZ_OK) |
|
|
0 |
0 |
if (p->streamEndWasReached || p->result != SZ_OK) |
|
24810
|
0 |
0 |
if (p->directInput) |
|
24813
|
0 |
0 |
if (curSize > p->directInputRem) |
|
24817
|
0 |
0 |
if (p->directInputRem == 0) |
|
24825
|
0 |
0 |
if (size == 0) |
|
24828
|
0 |
0 |
if (p->result != SZ_OK) |
|
24830
|
0 |
0 |
if (size == 0) |
|
24836
|
0 |
0 |
if (p->streamPos - p->pos > p->keepSizeAfter) |
|
24851
|
0 |
0 |
if (p->directInput) |
|
|
0 |
0 |
if (p->directInput) |
|
24859
|
0 |
0 |
if (p->streamEndWasReached) |
|
24861
|
0 |
0 |
if (p->keepSizeAfter >= p->streamPos - p->pos) |
|
24867
|
0 |
0 |
if (MatchFinder_NeedMove(p)) |
|
24890
|
0 |
0 |
for (i = 0; i < 256; i++) |
|
|
0 |
0 |
for (i = 0; i < 256; i++) |
|
24894
|
0 |
0 |
for (j = 0; j < 8; j++) |
|
|
0 |
0 |
for (j = 0; j < 8; j++) |
|
24915
|
0 |
0 |
if (sizeInBytes / sizeof(CLzRef) != num) |
|
24925
|
0 |
0 |
if (historySize > kMaxHistorySize) |
|
24931
|
0 |
0 |
if (historySize > ((uint32_t)2 << 30)) |
|
24938
|
0 |
0 |
if (LzInWindow_Create(p, sizeReserv, alloc)) |
|
24945
|
0 |
0 |
if (p->numHashBytes == 2) |
|
24956
|
0 |
0 |
if (hs > (1 << 24)) |
|
24958
|
0 |
0 |
if (p->numHashBytes == 3) |
|
24966
|
0 |
0 |
if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; |
|
24967
|
0 |
0 |
if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; |
|
24968
|
0 |
0 |
if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size; |
|
24978
|
0 |
0 |
p->numSons = (p->btMode ? newCyclicBufferSize * 2 : newCyclicBufferSize); |
|
24980
|
0 |
0 |
if (p->hash != 0 && prevSize == newSize) |
|
|
0 |
0 |
if (p->hash != 0 && prevSize == newSize) |
|
24984
|
0 |
0 |
if (p->hash != 0) |
|
24999
|
0 |
0 |
if (limit2 < limit) |
|
25002
|
0 |
0 |
if (limit2 <= p->keepSizeAfter) |
|
25004
|
0 |
0 |
if (limit2 > 0) |
|
25009
|
0 |
0 |
if (limit2 < limit) |
|
25013
|
0 |
0 |
if (lenLimit > p->matchMaxLen) |
|
25023
|
0 |
0 |
for (i = 0; i < p->hashSizeSum; i++) |
|
25042
|
0 |
0 |
for (i = 0; i < numItems; i++) |
|
|
0 |
0 |
for (i = 0; i < numItems; i++) |
|
25045
|
0 |
0 |
if (value <= subValue) |
|
|
0 |
0 |
if (value <= subValue) |
|
25062
|
0 |
0 |
if (p->pos == kMaxValForNormalize) |
|
25064
|
0 |
0 |
if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) |
|
|
0 |
0 |
if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) |
|
25066
|
0 |
0 |
if (p->cyclicBufferPos == p->cyclicBufferSize) |
|
25079
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
25083
|
0 |
0 |
curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; |
|
25084
|
0 |
0 |
if (pb[maxLen] == cur[maxLen] && *pb == *cur) |
|
|
0 |
0 |
if (pb[maxLen] == cur[maxLen] && *pb == *cur) |
|
25087
|
0 |
0 |
while (++len != lenLimit) |
|
25088
|
0 |
0 |
if (pb[len] != cur[len]) |
|
25090
|
0 |
0 |
if (maxLen < len) |
|
25094
|
0 |
0 |
if (len == lenLimit) |
|
25112
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
25118
|
0 |
0 |
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
|
25120
|
0 |
0 |
uint32_t len = (len0 < len1 ? len0 : len1); |
|
25121
|
0 |
0 |
if (pb[len] == cur[len]) |
|
25123
|
0 |
0 |
if (++len != lenLimit && pb[len] == cur[len]) |
|
|
0 |
0 |
if (++len != lenLimit && pb[len] == cur[len]) |
|
|
0 |
0 |
if (++len != lenLimit && pb[len] == cur[len]) |
|
25124
|
0 |
0 |
while (++len != lenLimit) |
|
25125
|
0 |
0 |
if (pb[len] != cur[len]) |
|
25127
|
0 |
0 |
if (maxLen < len) |
|
25131
|
0 |
0 |
if (len == lenLimit) |
|
25139
|
0 |
0 |
if (pb[len] < cur[len]) |
|
25166
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
25172
|
0 |
0 |
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
|
25174
|
0 |
0 |
uint32_t len = (len0 < len1 ? len0 : len1); |
|
25175
|
0 |
0 |
if (pb[len] == cur[len]) |
|
25177
|
0 |
0 |
while (++len != lenLimit) |
|
25178
|
0 |
0 |
if (pb[len] != cur[len]) |
|
25181
|
0 |
0 |
if (len == lenLimit) |
|
25189
|
0 |
0 |
if (pb[len] < cur[len]) |
|
25214
|
0 |
0 |
static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } |
|
25236
|
0 |
0 |
GET_MATCHES_HEADER(2) |
|
25241
|
0 |
0 |
GET_MATCHES_FOOTER(offset, 1) |
|
25247
|
0 |
0 |
GET_MATCHES_HEADER(3) |
|
25252
|
0 |
0 |
GET_MATCHES_FOOTER(offset, 2) |
|
25258
|
0 |
0 |
GET_MATCHES_HEADER(3) |
|
25270
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
25272
|
0 |
0 |
for (; maxLen != lenLimit; maxLen++) |
|
25273
|
0 |
0 |
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
|
25278
|
0 |
0 |
if (maxLen == lenLimit) |
|
25281
|
0 |
0 |
MOVE_POS_RET; |
|
25284
|
0 |
0 |
GET_MATCHES_FOOTER(offset, maxLen) |
|
25290
|
0 |
0 |
GET_MATCHES_HEADER(4) |
|
25304
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
25310
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
25317
|
0 |
0 |
if (offset != 0) |
|
25319
|
0 |
0 |
for (; maxLen != lenLimit; maxLen++) |
|
25320
|
0 |
0 |
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
|
25323
|
0 |
0 |
if (maxLen == lenLimit) |
|
25326
|
0 |
0 |
MOVE_POS_RET; |
|
25329
|
0 |
0 |
if (maxLen < 3) |
|
25331
|
0 |
0 |
GET_MATCHES_FOOTER(offset, maxLen) |
|
25337
|
0 |
0 |
GET_MATCHES_HEADER(4) |
|
25351
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
25357
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
25364
|
0 |
0 |
if (offset != 0) |
|
25366
|
0 |
0 |
for (; maxLen != lenLimit; maxLen++) |
|
25367
|
0 |
0 |
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
|
25370
|
0 |
0 |
if (maxLen == lenLimit) |
|
25373
|
0 |
0 |
MOVE_POS_RET; |
|
25376
|
0 |
0 |
if (maxLen < 3) |
|
25380
|
0 |
0 |
MOVE_POS_RET |
|
25386
|
0 |
0 |
GET_MATCHES_HEADER(3) |
|
25392
|
0 |
0 |
MOVE_POS_RET |
|
25397
|
0 |
0 |
do |
|
25399
|
0 |
0 |
SKIP_HEADER(2) |
|
25403
|
0 |
0 |
SKIP_FOOTER |
|
25410
|
0 |
0 |
do |
|
25412
|
0 |
0 |
SKIP_HEADER(3) |
|
25416
|
0 |
0 |
SKIP_FOOTER |
|
25423
|
0 |
0 |
do |
|
25426
|
0 |
0 |
SKIP_HEADER(3) |
|
25431
|
0 |
0 |
SKIP_FOOTER |
|
25438
|
0 |
0 |
do |
|
25441
|
0 |
0 |
SKIP_HEADER(4) |
|
25447
|
0 |
0 |
SKIP_FOOTER |
|
25454
|
0 |
0 |
do |
|
25457
|
0 |
0 |
SKIP_HEADER(4) |
|
25464
|
0 |
0 |
MOVE_POS |
|
25471
|
0 |
0 |
do |
|
25473
|
0 |
0 |
SKIP_HEADER(3) |
|
25478
|
0 |
0 |
MOVE_POS |
|
25489
|
0 |
0 |
if (!p->btMode) |
|
|
0 |
0 |
if (!p->btMode) |
|
25494
|
0 |
0 |
else if (p->numHashBytes == 2) |
|
|
0 |
0 |
else if (p->numHashBytes == 2) |
|
25499
|
0 |
0 |
else if (p->numHashBytes == 3) |
|
|
0 |
0 |
else if (p->numHashBytes == 3) |
|
25611
|
0 |
0 |
if (level < 0) level = 5; |
|
25613
|
0 |
0 |
if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26))); |
|
|
0 |
0 |
if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26))); |
|
|
0 |
0 |
if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26))); |
|
25614
|
0 |
0 |
if (p->lc < 0) p->lc = 3; |
|
25615
|
0 |
0 |
if (p->lp < 0) p->lp = 0; |
|
25616
|
0 |
0 |
if (p->pb < 0) p->pb = 2; |
|
25617
|
0 |
0 |
if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); |
|
25618
|
0 |
0 |
if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); |
|
|
0 |
0 |
if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); |
|
25619
|
0 |
0 |
if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); |
|
25620
|
0 |
0 |
if (p->numHashBytes < 0) p->numHashBytes = 4; |
|
25621
|
0 |
0 |
if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1); |
|
25622
|
0 |
0 |
if (p->numThreads < 0) |
|
25663
|
0 |
0 |
for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++) |
|
|
0 |
0 |
for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++) |
|
25667
|
0 |
0 |
for (j = 0; j < k; j++, c++) |
|
|
0 |
0 |
for (j = 0; j < k; j++, c++) |
|
25886
|
0 |
0 |
for (i = 0; i < kNumStates; i++) |
|
25891
|
0 |
0 |
for (i = 0; i < kNumLenToPosStates; i++) |
|
25912
|
0 |
0 |
for (i = 0; i < kNumStates; i++) |
|
25917
|
0 |
0 |
for (i = 0; i < kNumLenToPosStates; i++) |
|
25935
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
25936
|
0 |
0 |
props.dictSize > ((uint32_t)1 << kDicLogSizeMaxCompress) || props.dictSize > ((uint32_t)1 << 30)) |
|
25942
|
0 |
0 |
if (fb < 5) |
|
25944
|
0 |
0 |
if (fb > LZMA_MATCH_LEN_MAX) |
|
25955
|
0 |
0 |
if (props.btMode) |
|
25957
|
0 |
0 |
if (props.numHashBytes < 2) |
|
25959
|
0 |
0 |
else if (props.numHashBytes < 4) |
|
25994
|
0 |
0 |
if (p->bufBase == 0) |
|
25997
|
0 |
0 |
if (p->bufBase == 0) |
|
26027
|
0 |
0 |
if (p->res != SZ_OK) |
|
26030
|
0 |
0 |
if (num != p->outStream->Write(p->outStream, p->bufBase, num)) |
|
26038
|
0 |
0 |
if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0) |
|
|
0 |
0 |
if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0) |
|
26041
|
0 |
0 |
do |
|
26046
|
0 |
0 |
if (buf == p->bufLim) |
|
26060
|
0 |
0 |
for (i = 0; i < 5; i++) |
|
26070
|
0 |
0 |
if (p->range < kTopValue) |
|
26076
|
0 |
0 |
while (numBits != 0); |
|
26083
|
0 |
0 |
if (symbol == 0) |
|
26095
|
0 |
0 |
if (p->range < kTopValue) |
|
26110
|
0 |
0 |
while (symbol < 0x10000); |
|
26124
|
0 |
0 |
while (symbol < 0x10000); |
|
26130
|
0 |
0 |
for (i = (1 << kNumMoveReducingBits) / 2; i < kBitModelTotal; i += (1 << kNumMoveReducingBits)) |
|
26136
|
0 |
0 |
for (j = 0; j < kCyclesBits; j++) |
|
26140
|
0 |
0 |
while (w >= ((uint32_t)1 << 16)) |
|
26171
|
0 |
0 |
while (symbol < 0x10000); |
|
|
0 |
0 |
while (symbol < 0x10000); |
|
26187
|
0 |
0 |
while (symbol < 0x10000); |
|
26195
|
0 |
0 |
for (i = numBitLevels; i != 0;) |
|
26209
|
0 |
0 |
for (i = 0; i < numBitLevels; i++) |
|
26222
|
0 |
0 |
while (symbol != 1) |
|
|
0 |
0 |
while (symbol != 1) |
|
|
0 |
0 |
while (symbol != 1) |
|
|
0 |
0 |
while (symbol != 1) |
|
26235
|
0 |
0 |
for (i = numBitLevels; i != 0; i--) |
|
|
0 |
0 |
for (i = numBitLevels; i != 0; i--) |
|
26249
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++) |
|
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++) |
|
26251
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++) |
|
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++) |
|
26253
|
0 |
0 |
for (i = 0; i < kLenNumHighSymbols; i++) |
|
|
0 |
0 |
for (i = 0; i < kLenNumHighSymbols; i++) |
|
26259
|
0 |
0 |
if (symbol < kLenNumLowSymbols) |
|
26267
|
0 |
0 |
if (symbol < kLenNumLowSymbols + kLenNumMidSymbols) |
|
26287
|
0 |
0 |
for (i = 0; i < kLenNumLowSymbols; i++) |
|
26289
|
0 |
0 |
if (i >= numSymbols) |
|
26293
|
0 |
0 |
for (; i < kLenNumLowSymbols + kLenNumMidSymbols; i++) |
|
26295
|
0 |
0 |
if (i >= numSymbols) |
|
26299
|
0 |
0 |
for (; i < numSymbols; i++) |
|
26312
|
0 |
0 |
for (posState = 0; posState < numPosStates; posState++) |
|
|
0 |
0 |
for (posState = 0; posState < numPosStates; posState++) |
|
26319
|
0 |
0 |
if (updatePrice) |
|
26320
|
0 |
0 |
if (--p->counters[posState] == 0) |
|
26326
|
0 |
0 |
if (num != 0) |
|
|
0 |
0 |
if (num != 0) |
|
|
0 |
0 |
if (num != 0) |
|
|
0 |
0 |
if (num != 0) |
|
|
0 |
0 |
if (num != 0) |
|
|
0 |
0 |
if (num != 0) |
|
26338
|
0 |
0 |
if (numPairs > 0) |
|
26341
|
0 |
0 |
if (lenRes == p->numFastBytes) |
|
26346
|
0 |
0 |
if (numAvail > LZMA_MATCH_LEN_MAX) |
|
26350
|
0 |
0 |
for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++); |
|
|
0 |
0 |
for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++); |
|
26373
|
0 |
0 |
if (repIndex == 0) |
|
26381
|
0 |
0 |
if (repIndex == 1) |
|
26405
|
0 |
0 |
if (p->opt[cur].prev1IsChar) |
|
|
0 |
0 |
if (p->opt[cur].prev1IsChar) |
|
26409
|
0 |
0 |
if (p->opt[cur].prev2) |
|
|
0 |
0 |
if (p->opt[cur].prev2) |
|
26428
|
0 |
0 |
while (cur != 0); |
|
|
0 |
0 |
while (cur != 0); |
|
26444
|
0 |
0 |
if (p->optimumEndIndex != p->optimumCurrentIndex) |
|
26454
|
0 |
0 |
if (p->additionalOffset == 0) |
|
26463
|
0 |
0 |
if (numAvail < 2) |
|
26468
|
0 |
0 |
if (numAvail > LZMA_MATCH_LEN_MAX) |
|
26473
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
|
26479
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
26484
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
26486
|
0 |
0 |
if (lenTest > repLens[repMaxIndex]) |
|
26489
|
0 |
0 |
if (repLens[repMaxIndex] >= p->numFastBytes) |
|
26499
|
0 |
0 |
if (mainLen >= p->numFastBytes) |
|
26508
|
0 |
0 |
if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2) |
|
|
0 |
0 |
if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2) |
|
26520
|
0 |
0 |
p->opt[1].price = GET_PRICE_0(p->isMatch[p->state][posState]) + |
|
26531
|
0 |
0 |
if (matchByte == curByte) |
|
26534
|
0 |
0 |
if (shortRepPrice < p->opt[1].price) |
|
26540
|
0 |
0 |
lenEnd = ((mainLen >= repLens[repMaxIndex]) ? mainLen : repLens[repMaxIndex]); |
|
26542
|
0 |
0 |
if (lenEnd < 2) |
|
26549
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
|
26555
|
0 |
0 |
while (len >= 2); |
|
26557
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
|
26561
|
0 |
0 |
if (repLen < 2) |
|
26564
|
0 |
0 |
do |
|
26568
|
0 |
0 |
if (curAndLenPrice < opt->price) |
|
26581
|
0 |
0 |
len = ((repLens[0] >= 2) ? repLens[0] + 1 : 2); |
|
26582
|
0 |
0 |
if (len <= mainLen) |
|
26585
|
0 |
0 |
while (len > matches[offs]) |
|
26593
|
0 |
0 |
uint32_t lenToPosState = GetLenToPosState(len); |
|
26594
|
0 |
0 |
if (distance < kNumFullDistances) |
|
26603
|
0 |
0 |
if (curAndLenPrice < opt->price) |
|
26610
|
0 |
0 |
if (len == matches[offs]) |
|
26613
|
0 |
0 |
if (offs == numPairs) |
|
26632
|
0 |
0 |
if (cur == lenEnd) |
|
26636
|
0 |
0 |
if (newLen >= p->numFastBytes) |
|
26645
|
0 |
0 |
if (curOpt->prev1IsChar) |
|
26648
|
0 |
0 |
if (curOpt->prev2) |
|
26651
|
0 |
0 |
if (curOpt->backPrev2 < LZMA_NUM_REPS) |
|
26662
|
0 |
0 |
if (posPrev == cur - 1) |
|
26664
|
0 |
0 |
if (IsShortRep(curOpt)) |
|
26673
|
0 |
0 |
if (curOpt->prev1IsChar && curOpt->prev2) |
|
|
0 |
0 |
if (curOpt->prev1IsChar && curOpt->prev2) |
|
26682
|
0 |
0 |
if (pos < LZMA_NUM_REPS) |
|
26688
|
0 |
0 |
if (pos < LZMA_NUM_REPS) |
|
26692
|
0 |
0 |
for (i = 1; i <= pos; i++) |
|
26694
|
0 |
0 |
for (; i < LZMA_NUM_REPS; i++) |
|
26701
|
0 |
0 |
for (i = 1; i < LZMA_NUM_REPS; i++) |
|
26726
|
0 |
0 |
LitEnc_GetPrice(probs, curByte, p->ProbPrices)); |
|
26731
|
0 |
0 |
if (curAnd1Price < nextOpt->price) |
|
26742
|
0 |
0 |
if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0)) |
|
|
0 |
0 |
if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0)) |
|
|
0 |
0 |
if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0)) |
|
26745
|
0 |
0 |
if (shortRepPrice <= nextOpt->price) |
|
26756
|
0 |
0 |
if (temp < numAvailFull) |
|
26760
|
0 |
0 |
if (numAvailFull < 2) |
|
26762
|
0 |
0 |
numAvail = (numAvailFull <= p->numFastBytes ? numAvailFull : p->numFastBytes); |
|
26764
|
0 |
0 |
if (!nextIsChar && matchByte != curByte) /* speed optimization */ |
|
26771
|
0 |
0 |
if (limit > numAvailFull) |
|
26774
|
0 |
0 |
for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++); |
|
|
0 |
0 |
for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++); |
|
26776
|
0 |
0 |
if (lenTest2 >= 2) |
|
26788
|
0 |
0 |
while (lenEnd < offset) |
|
26792
|
0 |
0 |
if (curAndLenPrice < opt->price) |
|
26807
|
0 |
0 |
for (repIndex = 0; repIndex < LZMA_NUM_REPS; repIndex++) |
|
26813
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
26815
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
26816
|
0 |
0 |
while (lenEnd < cur + lenTest) |
|
26820
|
0 |
0 |
do |
|
26824
|
0 |
0 |
if (curAndLenPrice < opt->price) |
|
26835
|
0 |
0 |
if (repIndex == 0) |
|
26844
|
0 |
0 |
if (limit > numAvailFull) |
|
26846
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
26848
|
0 |
0 |
if (lenTest2 >= 2) |
|
26868
|
0 |
0 |
while (lenEnd < offset) |
|
26872
|
0 |
0 |
if (curAndLenPrice < opt->price) |
|
26888
|
0 |
0 |
if (newLen > numAvail) |
|
26891
|
0 |
0 |
for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2); |
|
26895
|
0 |
0 |
if (newLen >= startLen) |
|
26900
|
0 |
0 |
while (lenEnd < cur + newLen) |
|
26904
|
0 |
0 |
while (startLen > matches[offs]) |
|
26911
|
0 |
0 |
uint32_t lenToPosState = GetLenToPosState(lenTest); |
|
26913
|
0 |
0 |
if (curBack < kNumFullDistances) |
|
26919
|
0 |
0 |
if (curAndLenPrice < opt->price) |
|
26927
|
0 |
0 |
if (/*_maxMode && */lenTest == matches[offs]) |
|
26934
|
0 |
0 |
if (limit > numAvailFull) |
|
26936
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
26938
|
0 |
0 |
if (lenTest2 >= 2) |
|
26957
|
0 |
0 |
while (lenEnd < offset) |
|
26961
|
0 |
0 |
if (curAndLenPrice < opt->price) |
|
26974
|
0 |
0 |
if (offs == numPairs) |
|
26977
|
0 |
0 |
if (curBack >= kNumFullDistances) |
|
26993
|
0 |
0 |
if (p->additionalOffset == 0) |
|
27003
|
0 |
0 |
if (numAvail < 2) |
|
27005
|
0 |
0 |
if (numAvail > LZMA_MATCH_LEN_MAX) |
|
27010
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
|
27014
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
27016
|
0 |
0 |
for (len = 2; len < numAvail && data[len] == data2[len]; len++); |
|
|
0 |
0 |
for (len = 2; len < numAvail && data[len] == data2[len]; len++); |
|
27017
|
0 |
0 |
if (len >= p->numFastBytes) |
|
27023
|
0 |
0 |
if (len > repLen) |
|
27031
|
0 |
0 |
if (mainLen >= p->numFastBytes) |
|
27039
|
0 |
0 |
if (mainLen >= 2) |
|
27042
|
0 |
0 |
while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1) |
|
|
0 |
0 |
while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1) |
|
27044
|
0 |
0 |
if (!ChangePair(matches[numPairs - 3], mainDist)) |
|
27050
|
0 |
0 |
if (mainLen == 2 && mainDist >= 0x80) |
|
27054
|
0 |
0 |
if (repLen >= 2 && ( |
|
|
0 |
0 |
if (repLen >= 2 && ( |
|
27055
|
0 |
0 |
(repLen + 1 >= mainLen) || |
|
27056
|
0 |
0 |
(repLen + 2 >= mainLen && mainDist >= (1 << 9)) || |
|
|
0 |
0 |
(repLen + 2 >= mainLen && mainDist >= (1 << 9)) || |
|
27057
|
0 |
0 |
(repLen + 3 >= mainLen && mainDist >= (1 << 15)))) |
|
27064
|
0 |
0 |
if (mainLen < 2 || numAvail <= 2) |
|
27068
|
0 |
0 |
if (p->longestMatchLength >= 2) |
|
27071
|
0 |
0 |
if ((p->longestMatchLength >= mainLen && newDistance < mainDist) || |
|
|
0 |
0 |
if ((p->longestMatchLength >= mainLen && newDistance < mainDist) || |
|
|
0 |
0 |
if ((p->longestMatchLength >= mainLen && newDistance < mainDist) || |
|
27072
|
0 |
0 |
(p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) || |
|
|
0 |
0 |
(p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) || |
|
27073
|
0 |
0 |
(p->longestMatchLength > mainLen + 1) || |
|
27074
|
0 |
0 |
(p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist))) |
|
|
0 |
0 |
(p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist))) |
|
27079
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
|
27083
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
27086
|
0 |
0 |
for (len = 2; len < limit && data[len] == data2[len]; len++); |
|
|
0 |
0 |
for (len = 2; len < limit && data[len] == data2[len]; len++); |
|
27087
|
0 |
0 |
if (len >= limit) |
|
27110
|
0 |
0 |
if (p->result != SZ_OK) |
|
27112
|
0 |
0 |
if (p->rc.res != SZ_OK) |
|
|
0 |
0 |
if (p->rc.res != SZ_OK) |
|
|
0 |
0 |
if (p->rc.res != SZ_OK) |
|
27114
|
0 |
0 |
if (p->matchFinderBase.result != SZ_OK) |
|
|
0 |
0 |
if (p->matchFinderBase.result != SZ_OK) |
|
|
0 |
0 |
if (p->matchFinderBase.result != SZ_OK) |
|
27116
|
0 |
0 |
if (p->result != SZ_OK) |
|
|
0 |
0 |
if (p->result != SZ_OK) |
|
|
0 |
0 |
if (p->result != SZ_OK) |
|
27125
|
0 |
0 |
if (p->writeEndMark) |
|
27135
|
0 |
0 |
for (i = 0; i < kAlignTableSize; i++) |
|
27144
|
0 |
0 |
for (i = kStartPosModelIndex; i < kNumFullDistances; i++) |
|
27152
|
0 |
0 |
for (lenToPosState = 0; lenToPosState < kNumLenToPosStates; lenToPosState++) |
|
27157
|
0 |
0 |
for (posSlot = 0; posSlot < p->distTableSize; posSlot++) |
|
27159
|
0 |
0 |
for (posSlot = kEndPosModelIndex; posSlot < p->distTableSize; posSlot++) |
|
27165
|
0 |
0 |
for (i = 0; i < kStartPosModelIndex; i++) |
|
27167
|
0 |
0 |
for (; i < kNumFullDistances; i++) |
|
27198
|
0 |
0 |
if (p != 0) |
|
27227
|
0 |
0 |
if (p->needInit) |
|
27233
|
0 |
0 |
if (p->finished) |
|
27235
|
0 |
0 |
RINOK(CheckErrors(p)); |
|
27240
|
0 |
0 |
if (p->nowPos64 == 0) |
|
27244
|
0 |
0 |
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0) |
|
27255
|
0 |
0 |
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) != 0) |
|
27260
|
0 |
0 |
if (p->fastMode) |
|
27266
|
0 |
0 |
if (len == 1 && pos == (uint32_t)-1) |
|
|
0 |
0 |
if (len == 1 && pos == (uint32_t)-1) |
|
27276
|
0 |
0 |
if (IsCharState(p->state)) |
|
27285
|
0 |
0 |
if (pos < LZMA_NUM_REPS) |
|
27288
|
0 |
0 |
if (pos == 0) |
|
27297
|
0 |
0 |
if (pos == 1) |
|
27303
|
0 |
0 |
if (pos == 3) |
|
27310
|
0 |
0 |
if (len == 1) |
|
27325
|
0 |
0 |
GetPosSlot(pos, posSlot); |
|
27326
|
0 |
0 |
RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], kNumPosSlotBits, posSlot); |
|
27328
|
0 |
0 |
if (posSlot >= kStartPosModelIndex) |
|
27334
|
0 |
0 |
if (posSlot < kEndPosModelIndex) |
|
27352
|
0 |
0 |
if (p->additionalOffset == 0) |
|
27355
|
0 |
0 |
if (!p->fastMode) |
|
27357
|
0 |
0 |
if (p->matchPriceCount >= (1 << 7)) |
|
27359
|
0 |
0 |
if (p->alignPriceCount >= kAlignTableSize) |
|
27362
|
0 |
0 |
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0) |
|
27365
|
0 |
0 |
if (useLimits) |
|
27367
|
0 |
0 |
if (processed + kNumOpts + 300 >= maxUnpackSize || |
|
|
0 |
0 |
if (processed + kNumOpts + 300 >= maxUnpackSize || |
|
27371
|
0 |
0 |
else if (processed >= (1 << 15)) |
|
27387
|
0 |
0 |
if (!RangeEnc_Alloc(&p->rc, alloc)) |
|
27392
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
|
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
|
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
|
27397
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0) |
|
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0) |
|
27408
|
0 |
0 |
if (beforeSize + p->dictSize < keepWindowSize) |
|
27412
|
0 |
0 |
if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) |
|
27424
|
0 |
0 |
for (i = 0 ; i < LZMA_NUM_REPS; i++) |
|
27429
|
0 |
0 |
for (i = 0; i < kNumStates; i++) |
|
27432
|
0 |
0 |
for (j = 0; j < LZMA_NUM_PB_STATES_MAX; j++) |
|
27445
|
0 |
0 |
for (i = 0; i < num; i++) |
|
27450
|
0 |
0 |
for (i = 0; i < kNumLenToPosStates; i++) |
|
27454
|
0 |
0 |
for (j = 0; j < (1 << kNumPosSlotBits); j++) |
|
27459
|
0 |
0 |
for (i = 0; i < kNumFullDistances - kEndPosModelIndex; i++) |
|
27466
|
0 |
0 |
for (i = 0; i < (1 << kNumAlignBits); i++) |
|
27479
|
0 |
0 |
if (!p->fastMode) |
|
27495
|
0 |
0 |
for (i = 0; i < (uint32_t)kDicLogSizeMaxCompress; i++) |
|
27496
|
0 |
0 |
if (p->dictSize <= ((uint32_t)1 << i)) |
|
27502
|
0 |
0 |
RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig)); |
|
27561
|
0 |
0 |
if (p->rem < size) |
|
27601
|
0 |
0 |
if (reInit) |
|
27612
|
0 |
0 |
if (outStream.overflow) |
|
27625
|
0 |
0 |
if (res != SZ_OK || p->finished != 0) |
|
|
0 |
0 |
if (res != SZ_OK || p->finished != 0) |
|
27627
|
0 |
0 |
if (progress != 0) |
|
27630
|
0 |
0 |
if (res != SZ_OK) |
|
27644
|
0 |
0 |
RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig)); |
|
27653
|
0 |
0 |
if (*size < LZMA_PROPS_SIZE) |
|
27658
|
0 |
0 |
for (i = 11; i <= 30; i++) |
|
27660
|
0 |
0 |
if (dictSize <= ((uint32_t)2 << i)) |
|
27665
|
0 |
0 |
if (dictSize <= ((uint32_t)3 << i)) |
|
27672
|
0 |
0 |
for (i = 0; i < 4; i++) |
|
27696
|
0 |
0 |
if (res == SZ_OK) |
|
27700
|
0 |
0 |
if (outStream.overflow) |
|
27711
|
0 |
0 |
if (p == 0) |
|
27715
|
0 |
0 |
if (res == SZ_OK) |
|
27718
|
0 |
0 |
if (res == SZ_OK) |
|
27746
|
0 |
0 |
auto res = lzma::LzmaEncode(compressed.data(), &compressed_size, enc.data.data(), uncompressed_size, &props, props_encoded, &props_encoded_size, 0, nullptr, &lzmaAllocator, &lzmaAllocator); |
|
27747
|
0 |
0 |
if (res != SZ_OK) return false; |
|
27750
|
0 |
0 |
if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false; |
|
|
0 |
0 |
if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false; |
|
27751
|
0 |
0 |
if (!os.write((const char*) &uncompressed_size, sizeof(uint32_t))) return false; |
|
|
0 |
0 |
if (!os.write((const char*) &uncompressed_size, sizeof(uint32_t))) return false; |
|
27752
|
0 |
0 |
if (!os.write((const char*) &compressed_size, sizeof(uint32_t))) return false; |
|
|
0 |
0 |
if (!os.write((const char*) &compressed_size, sizeof(uint32_t))) return false; |
|
27753
|
0 |
0 |
if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false; |
|
|
0 |
0 |
if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false; |
|
27754
|
0 |
0 |
if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false; |
|
|
0 |
0 |
if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false; |
|
27755
|
0 |
0 |
if (!os.write((const char*) compressed.data(), compressed_size)) return false; |
|
|
0 |
0 |
if (!os.write((const char*) compressed.data(), compressed_size)) return false; |
|
27777
|
0 |
0 |
return {1, 3, 0, ""}; |
|
27790
|
0 |
0 |
<< (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease |
|
|
0 |
0 |
<< (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease |
|
27792
|
0 |
0 |
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
|
|
0 |
0 |
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
|
27794
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
27796
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
27797
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
27799
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
|
27805
|
2 |
0 |
} // namespace ufal |
|
|
2 |
0 |
} // namespace ufal |