line |
true |
false |
branch |
93
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
|
0 |
0 |
return os.write(str.str, str.len); |
97
|
1 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
1 |
2 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
1 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
1 |
2 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
1 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
1 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
1 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
15 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
15 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
3 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
3 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
16 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
16 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
16 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
|
0 |
0 |
return a.len == b.len && memcmp(a.str, b.str, a.len) == 0; |
207
|
0 |
0 |
class multiword_token : public token { |
|
0 |
0 |
class multiword_token : public token { |
|
0 |
0 |
class multiword_token : public token { |
229
|
0 |
0 |
class word : public token { |
259
|
0 |
0 |
class sentence { |
|
0 |
0 |
class sentence { |
|
0 |
0 |
class sentence { |
417
|
0 |
0 |
pair_system_gold(const word& system, const word& gold) : system(system), gold(gold) {} |
492
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
16 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
6 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
28 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
1 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
18 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
0 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
7 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
|
22 |
0 |
return chr < CHARS ? 1 << category_block[category_index[chr >> 8]][chr & 0xFF] : DEFAULT_CAT; |
496
|
30 |
0 |
if (chr < CHARS) { |
498
|
2 |
28 |
if ((othercase & 0xFF) == othercase_type::LOWER_ONLY) return othercase >> 8; |
499
|
0 |
28 |
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase >> 8; |
500
|
0 |
28 |
if ((othercase & 0xFF) == othercase_type::TITLE_THEN_LOWER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
506
|
0 |
0 |
if (chr < CHARS) { |
508
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPERTITLE_ONLY) return othercase >> 8; |
509
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPER_ONLY) return othercase >> 8; |
510
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::UPPER_THEN_TITLE) return othercase >> 8; |
511
|
0 |
0 |
if ((othercase & 0xFF) == othercase_type::LOWER_THEN_UPPER) return othercase_block[othercase_index[(othercase >> 8) >> 8]][(othercase >> 8) & 0xFF] >> 8; |
605
|
54 |
0 |
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
606
|
0 |
0 |
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
607
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
609
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
611
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
613
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
615
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
617
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
619
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
621
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
623
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
629
|
145 |
0 |
if (!len) return 0; |
631
|
122 |
23 |
if (((unsigned char)*str) < 0x80) return (unsigned char)*str++; |
632
|
0 |
23 |
else if (((unsigned char)*str) < 0xC0) return ++str, REPLACEMENT_CHAR; |
633
|
23 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
635
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
23 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
637
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
639
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
641
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
643
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
645
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
647
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
649
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
|
0 |
0 |
if (len <= 0 || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return REPLACEMENT_CHAR; |
674
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
36 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
33 |
3 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
18 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
15 |
3 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
|
0 |
0 |
iterator& operator++() { if (next) { codepoint = decode(next); if (!codepoint) next = nullptr; } return *this; } |
706
|
0 |
0 |
iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; } |
|
0 |
0 |
iterator& operator++() { if (!len) next = nullptr; if (next) codepoint = decode(next, len); return *this; } |
740
|
25 |
5 |
if (chr < 0x80) str += chr; |
741
|
5 |
0 |
else if (chr < 0x800) { str += 0xC0 + (chr >> 6); str += 0x80 + (chr & 0x3F); } |
742
|
0 |
0 |
else if (chr < 0x10000) { str += 0xE0 + (chr >> 12); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
743
|
0 |
0 |
else if (chr < 0x200000) { str += 0xF0 + (chr >> 18); str += 0x80 + ((chr >> 12) & 0x3F); str += 0x80 + ((chr >> 6) & 0x3F); str += 0x80 + (chr & 0x3F); } |
750
|
0 |
0 |
for (char32_t chr; (chr = decode(str)); ) |
757
|
29 |
7 |
while (len) |
762
|
0 |
0 |
map(f, str.c_str(), result); |
|
0 |
0 |
map(f, str.c_str(), result); |
|
0 |
0 |
map(f, str.c_str(), result); |
|
0 |
0 |
map(f, str.c_str(), result); |
|
0 |
0 |
map(f, str.c_str(), result); |
809
|
0 |
0 |
unique_ptr conllu_input(input_format::new_conllu_input_format()); |
810
|
0 |
0 |
if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false; |
|
0 |
0 |
if (!conllu_input) return error.assign("Cannot allocate CoNLL-U input format instance!"), false; |
812
|
0 |
0 |
vector plain_text_paragraphs(1); unsigned space_after_nos = 0; |
813
|
0 |
0 |
sentence system, gold; |
|
0 |
0 |
sentence system, gold; |
817
|
0 |
0 |
while (conllu_input->read_block(is, block)) { |
|
0 |
0 |
while (conllu_input->read_block(is, block)) { |
818
|
0 |
0 |
conllu_input->set_text(block); |
819
|
0 |
0 |
while (conllu_input->next_sentence(gold, error)) { |
|
0 |
0 |
while (conllu_input->next_sentence(gold, error)) { |
820
|
0 |
0 |
gold_data.add_sentence(gold); |
823
|
0 |
0 |
if (tokenizer != NONE) { |
824
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
|
0 |
0 |
if (gold.get_new_doc() || gold.get_new_par()) { |
825
|
0 |
0 |
plain_text_paragraphs.back().append("\n\n"); |
826
|
0 |
0 |
plain_text_paragraphs.emplace_back(); |
829
|
0 |
0 |
for (size_t i = 1, j = 0; i < gold.words.size(); i++) { |
830
|
0 |
0 |
const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i]; |
|
0 |
0 |
const token& tok = j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i) ? (const token&)gold.multiword_tokens[j] : (const token&)gold.words[i]; |
832
|
0 |
0 |
if (tok.get_space_after()) |
|
0 |
0 |
if (tok.get_space_after()) |
833
|
0 |
0 |
plain_text_paragraphs.back().push_back(' '); |
836
|
0 |
0 |
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < gold.multiword_tokens.size() && gold.multiword_tokens[j].id_first == int(i)) |
842
|
0 |
0 |
if (tokenizer == NONE && tagger != NONE) { |
843
|
0 |
0 |
system.clear(); |
844
|
0 |
0 |
for (size_t i = 1; i < gold.words.size(); i++) |
847
|
0 |
0 |
if (tagger != NONE) { |
848
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
850
|
0 |
0 |
if (parser != NONE) |
851
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
0 |
0 |
if (!m->parse(system, parser, error)) |
854
|
0 |
0 |
system_goldtok_data.add_sentence(system); |
858
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
859
|
0 |
0 |
system.clear(); |
860
|
0 |
0 |
for (size_t i = 1; i < gold.words.size(); i++) { |
867
|
0 |
0 |
if (parser != NONE) |
868
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
0 |
0 |
if (!m->parse(system, parser, error)) |
870
|
0 |
0 |
system_goldtok_goldtags_data.add_sentence(system); |
873
|
0 |
0 |
if (!error.empty()) return false; |
877
|
0 |
0 |
if (tokenizer != NONE) { |
878
|
0 |
0 |
unique_ptr t(m->new_tokenizer(tokenizer)); |
879
|
0 |
0 |
if (!t) return error.assign("Cannot allocate new tokenizer!"), false; |
|
0 |
0 |
if (!t) return error.assign("Cannot allocate new tokenizer!"), false; |
881
|
0 |
0 |
for (auto&& plain_text : plain_text_paragraphs) { |
882
|
0 |
0 |
t->set_text(plain_text); |
883
|
0 |
0 |
while (t->next_sentence(system, error)) { |
|
0 |
0 |
while (t->next_sentence(system, error)) { |
884
|
0 |
0 |
if (tagger != NONE) { |
885
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
|
0 |
0 |
if (!m->tag(system, tagger, error)) |
888
|
0 |
0 |
if (parser != NONE) |
889
|
0 |
0 |
if (!m->parse(system, parser, error)) |
|
0 |
0 |
if (!m->parse(system, parser, error)) |
892
|
0 |
0 |
system_plaintext_data.add_sentence(system); |
894
|
0 |
0 |
if (!error.empty()) return false; |
899
|
0 |
0 |
if (tokenizer != NONE) { |
900
|
0 |
0 |
if (system_plaintext_data.chars != gold_data.chars) { |
904
|
0 |
0 |
word_alignment::best_alignment(system_plaintext_data, gold_data, plaintext_alignment); |
912
|
0 |
0 |
if (multiwords.total_gold || multiwords.total_system) |
|
0 |
0 |
if (multiwords.total_gold || multiwords.total_system) |
926
|
0 |
0 |
if (tagger != NONE) { |
930
|
0 |
0 |
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
0 |
0 |
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
0 |
0 |
auto alltags = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
938
|
0 |
0 |
if (tagger != NONE && parser != NONE) { |
940
|
0 |
0 |
auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
0 |
0 |
auto las = plaintext_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
948
|
0 |
0 |
if (tokenizer == NONE && tagger != NONE) { |
950
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment)) |
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_data, gold_data, goldtok_alignment)) |
951
|
0 |
0 |
return error.assign("Internal UDPipe error (the words of the gold data do not match)!"), false; |
956
|
0 |
0 |
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
0 |
0 |
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
|
0 |
0 |
auto alltags = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.upostag == u.upostag && w.xpostag == u.xpostag && w.feats == u.feats; }); |
963
|
0 |
0 |
if (parser != NONE) { |
965
|
0 |
0 |
auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
0 |
0 |
auto las = goldtok_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
972
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
|
0 |
0 |
if (tokenizer == NONE && tagger == NONE && parser != NONE) { |
974
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment)) |
|
0 |
0 |
if (!word_alignment::perfect_alignment(system_goldtok_goldtags_data, gold_data, goldtok_goldtags_alignment)) |
975
|
0 |
0 |
return error.assign("Internal UDPipe error (the words of the goldtok data do not match)!"), false; |
978
|
0 |
0 |
auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
|
0 |
0 |
auto las = goldtok_goldtags_alignment.evaluate_f1([](const word& w, const word& u) { return w.head == u.head && w.deprel == u.deprel; }); |
989
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
990
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].first < gold[gi].first)) |
992
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].first < system[si].first)) |
998
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
|
0 |
0 |
gold.size() ? both / double(gold.size()) : 0., system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0. }; |
1006
|
0 |
0 |
this->w.head = w.head ? id + (w.head - w.id) : 0; |
1014
|
0 |
0 |
if (colon != string::npos) |
1015
|
0 |
0 |
this->w.deprel.erase(colon); |
1020
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
1022
|
0 |
0 |
const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form; |
|
0 |
0 |
const string& form = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? s.multiword_tokens[j].form : s.words[i].form; |
1023
|
0 |
0 |
for (auto&& chr : unilib::utf8::decoder(form)) |
1024
|
0 |
0 |
if (chr != ' ') |
1028
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) { |
1030
|
0 |
0 |
for (size_t k = i; int(k) <= s.multiword_tokens[j].id_last; k++) { |
1045
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
|
0 |
0 |
for (auto&& match : matched) |
1046
|
0 |
0 |
if (equals(match.system, match.gold)) |
|
0 |
0 |
if (equals(match.system, match.gold)) |
|
0 |
0 |
if (equals(match.system, match.gold)) |
|
0 |
0 |
if (equals(match.system, match.gold)) |
1050
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
|
0 |
0 |
total_gold ? both / double(total_gold) : 0., total_system+total_gold ? 2 * both / double(total_system + total_gold) : 0. }; |
1056
|
0 |
0 |
if (alignment.total_system != alignment.total_gold) return false; |
1060
|
0 |
0 |
for (size_t i = 0; i < system.words.size(); i++) { |
1061
|
0 |
0 |
if (system.words[i].w.form != gold.words[i].w.form) |
1074
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.words.size() && gi < gold.words.size(); ) |
1075
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
|
0 |
0 |
if ((system.words[si].start > gold.words[gi].start || !system.words[si].is_multiword) && |
1076
|
0 |
0 |
(gold.words[gi].start > system.words[si].start || !gold.words[gi].is_multiword)) { |
1078
|
0 |
0 |
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
|
0 |
0 |
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
|
0 |
0 |
if (system.words[si].start == gold.words[gi].start && system.words[si].end == gold.words[gi].end) |
1080
|
0 |
0 |
else if (system.words[si].start <= gold.words[gi].start) |
1086
|
0 |
0 |
size_t ss = si, gs = gi, multiword_range_end = system.words[si].is_multiword ? system.words[si].end : gold.words[gi].end; |
1089
|
0 |
0 |
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
|
0 |
0 |
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
|
0 |
0 |
while ((si < system.words.size() && (system.words[si].is_multiword ? system.words[si].start < multiword_range_end : |
1090
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
|
0 |
0 |
system.words[si].end <= multiword_range_end)) || |
1091
|
0 |
0 |
(gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end : |
|
0 |
0 |
(gi < gold.words.size() && (gold.words[gi].is_multiword ? gold.words[gi].start < multiword_range_end : |
1094
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
|
0 |
0 |
if (si < system.words.size() && (gi >= gold.words.size() || system.words[si].start <= gold.words[gi].start)) { |
1095
|
0 |
0 |
if (system.words[si].is_multiword) multiword_range_end = max(multiword_range_end, system.words[si].end); |
1098
|
0 |
0 |
if (gold.words[gi].is_multiword) multiword_range_end = max(multiword_range_end, gold.words[gi].end); |
1105
|
0 |
0 |
for (unsigned s = si - ss; s--; ) { |
1106
|
0 |
0 |
lcs[s].resize(gi - gs); |
1107
|
0 |
0 |
for (unsigned g = gi - gs; g--; ) { |
1108
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], s+1 < lcs.size() ? lcs[s+1][g] : 0); |
1109
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], g+1 < lcs[s].size() ? lcs[s][g+1] : 0); |
1110
|
0 |
0 |
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
1111
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0)); |
|
0 |
0 |
lcs[s][g] = max(lcs[s][g], 1 + (s+1 < lcs.size() && g+1 < lcs[s].size() ? lcs[s+1][g+1] : 0)); |
1115
|
0 |
0 |
for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) { |
|
0 |
0 |
for (unsigned s = 0, g = 0; s < si - ss && g < gi - gs; ) { |
1116
|
0 |
0 |
if (system.words[ss + s].w.form == gold.words[gs + g].w.form) |
1117
|
0 |
0 |
alignment.matched.emplace_back(system.words[ss + s++].w, gold.words[gs + g++].w); |
1118
|
0 |
0 |
else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0)) |
|
0 |
0 |
else if (lcs[s][g] == (s+1 < lcs.size() ? lcs[s+1][g] : 0)) |
1127
|
0 |
0 |
for (auto&& match : alignment.matched) |
1129
|
0 |
0 |
for (auto&& match : alignment.matched) |
1130
|
0 |
0 |
if (match.system.head > 0) |
1395
|
0 |
0 |
class node { |
1430
|
0 |
0 |
class tree { |
|
0 |
0 |
class tree { |
|
0 |
0 |
class tree { |
|
0 |
0 |
class tree { |
|
0 |
0 |
class tree { |
1499
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
|
0 |
0 |
explicit binary_decoder_error(const char* description) : runtime_error(description) {} |
1502
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
|
0 |
0 |
class binary_decoder { |
1527
|
6 |
0 |
buffer.resize(len); |
1535
|
0 |
1308 |
if (data + 1 > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
1540
|
0 |
26 |
if (data + sizeof(uint16_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
1548
|
0 |
1573 |
if (data + sizeof(uint32_t) > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
1557
|
1 |
35 |
if (len == 255) len = next_4B(); |
1562
|
0 |
603 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
0 |
484 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
|
0 |
185 |
if (data + sizeof(T) * elements > data_end) throw binary_decoder_error("No more data in binary_decoder"); |
1577
|
0 |
1 |
if (pos > buffer.size()) throw binary_decoder_error("Cannot seek past end of binary_decoder"); |
1683
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
34 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
34 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
1688
|
34 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
8 |
26 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
1694
|
0 |
34 |
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': empty string."), false; |
1698
|
34 |
34 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
34 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
34 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
1699
|
26 |
8 |
if (positive) { |
1700
|
0 |
26 |
if (value > (numeric_limits::max() - (str.str[0] - '0')) / 10) |
1704
|
0 |
8 |
if (value < (numeric_limits::min() + (str.str[0] - '0')) / 10) |
1712
|
0 |
34 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
1716
|
0 |
34 |
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" int value '").append(original.str, original.len).append("': non-digit character found."), false; |
1725
|
0 |
0 |
if (!parse_int(str, value_name, result, error)) |
|
0 |
0 |
if (!parse_int(str, value_name, result, error)) |
1866
|
0 |
3 |
for (size_t start = 0; start < values.size(); ) { |
1867
|
0 |
0 |
while (start < values.size() && values[start] == ';') start++; |
|
0 |
0 |
while (start < values.size() && values[start] == ';') start++; |
|
0 |
0 |
while (start < values.size() && values[start] == ';') start++; |
1868
|
0 |
0 |
if (start >= values.size()) break; |
1871
|
0 |
0 |
name.assign(values, start, name_end - start); |
1874
|
0 |
0 |
if (name_end == string::npos) { |
1876
|
0 |
0 |
} else if (values[name_end] == ';') { |
1881
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
|
0 |
0 |
if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "file:") == 0) { |
1886
|
0 |
0 |
file.assign(values, file_name, semicolon - file_name); |
1887
|
0 |
0 |
ifstream is(path_from_utf8(file).c_str()); |
1888
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open file '").append(file).append("'!"), false; |
1891
|
0 |
0 |
for (value.clear(); is.read(buffer, sizeof(buffer)); ) |
|
0 |
0 |
for (value.clear(); is.read(buffer, sizeof(buffer)); ) |
1892
|
0 |
0 |
value.append(buffer, sizeof(buffer)); |
1893
|
0 |
0 |
value.append(buffer, is.gcount()); |
1896
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
|
0 |
0 |
} else if (equal_sign + 1 + 5 <= values.size() && values.compare(equal_sign + 1, 5, "data:") == 0) { |
1900
|
0 |
0 |
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
|
0 |
0 |
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
|
0 |
0 |
if (data_size_end == string::npos) return error.assign("Cannot parse named values, data size of value '").append(name).append("' not terminated!"), false; |
1903
|
0 |
0 |
if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false; |
|
0 |
0 |
if (!parse_int(string_piece(values.c_str() + data_size_start, data_size_end - data_size_start), "data_size", data_size, error)) return false; |
1906
|
0 |
0 |
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
|
0 |
0 |
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
|
0 |
0 |
if (data_end > values.size()) return error.assign("Cannot parse named values, value '").append(name).append("' shorter than specified length!"), false; |
1907
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
|
0 |
0 |
if (data_end < values.size() && values[data_end] != ';') return error.assign("Cannot parse named values, value '").append(name).append("' not terminated by semicolon!"), false; |
1909
|
0 |
0 |
value.assign(values, data_start, data_end - data_start); |
1914
|
0 |
0 |
value.assign(values, equal_sign + 1, semicolon - equal_sign - 1); |
1961
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
1970
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
0 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
|
0 |
1 |
while (lock.test_and_set(memory_order_acquire)) {} |
1971
|
0 |
0 |
if (!stack.empty()) { |
|
0 |
0 |
if (!stack.empty()) { |
|
0 |
1 |
if (!stack.empty()) { |
|
0 |
1 |
if (!stack.empty()) { |
|
0 |
1 |
if (!stack.empty()) { |
|
0 |
1 |
if (!stack.empty()) { |
2027
|
1 |
0 |
struct parser_cache { |
2090
|
1 |
0 |
ifstream in(path_from_utf8(fname).c_str(), ifstream::in | ifstream::binary); |
2091
|
1 |
0 |
if (!in.is_open()) return nullptr; |
2092
|
1 |
0 |
return load(in); |
2097
|
1 |
0 |
if (!is.get(len)) return nullptr; |
2099
|
1 |
0 |
if (!is.read(&name[0], len)) return nullptr; |
|
1 |
0 |
if (!is.read(&name[0], len)) return nullptr; |
2101
|
1 |
0 |
if (name == "morphodita_parsito") return model_morphodita_parsito::load(is); |
|
1 |
0 |
if (name == "morphodita_parsito") return model_morphodita_parsito::load(is); |
2245
|
0 |
0 |
for (string line; getline(is, line); ) { |
|
0 |
0 |
for (string line; getline(is, line); ) { |
2247
|
0 |
0 |
para.push_back('\n'); |
2249
|
0 |
0 |
if (line.empty()) break; |
2252
|
0 |
0 |
if (is.eof() && !para.empty()) is.clear(istream::eofbit); |
|
0 |
0 |
if (is.eof() && !para.empty()) is.clear(istream::eofbit); |
|
0 |
0 |
if (is.eof() && !para.empty()) is.clear(istream::eofbit); |
2294
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
2299
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
2305
|
0 |
0 |
if (!str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': empty string."), false; |
2309
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
2315
|
0 |
0 |
if (str.len && str.str[0] == '.') { |
|
0 |
0 |
if (str.len && str.str[0] == '.') { |
2319
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
2327
|
0 |
0 |
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
2330
|
0 |
0 |
if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) { |
|
0 |
0 |
if (str.len && (str.str[0] == 'e' || str.str[0] == 'E')) { |
2335
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
|
0 |
0 |
if (str.len && (str.str[0] == '+' || str.str[0] == '-')) { |
2340
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
|
0 |
0 |
while (str.len && str.str[0] >= '0' && str.str[0] <= '9') { |
2345
|
0 |
0 |
exponent = pow(10., exponent_negative ? -exponent : exponent); |
2346
|
0 |
0 |
if (!isfinite(exponent)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent overflow occured."), false; |
2347
|
0 |
0 |
if (exponent == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': exponent underflow occured."), false; |
2349
|
0 |
0 |
if (value) { |
2351
|
0 |
0 |
if (!isfinite(value)) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': overflow occured."), false; |
2352
|
0 |
0 |
if (value == 0) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': underflow occured."), false; |
2357
|
0 |
0 |
if (negative) value *= -1; |
2360
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
|
0 |
0 |
while (str.len && (str.str[0] == ' ' || str.str[0] == '\f' || str.str[0] == '\n' || str.str[0] == '\r' || str.str[0] == '\t' || str.str[0] == '\v')) |
2364
|
0 |
0 |
if (str.len) return error.assign("Cannot parse ").append(value_name).append(" double value '").append(original.str, original.len).append("': non-digit character found."), false; |
2373
|
0 |
0 |
if (!parse_double(str, value_name, result, error)) |
|
0 |
0 |
if (!parse_double(str, value_name, result, error)) |
2400
|
1 |
0 |
if (!tokenizer_factory) |
2405
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
2408
|
1 |
0 |
bool normalized_spaces = parsed_options.count("normalized_spaces"); |
2409
|
1 |
0 |
bool token_ranges = parsed_options.count("ranges"); |
2411
|
1 |
0 |
const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr; |
|
1 |
0 |
const auto* morpho = !taggers.empty() ? taggers[0].tagger->get_morpho() : nullptr; |
2412
|
1 |
0 |
unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges)); |
|
1 |
0 |
unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges)); |
|
1 |
0 |
unique_ptr result(new morphodita_tokenizer_wrapper(tokenizer_factory->new_tokenizer(morpho), splitter.get(), normalized_spaces, token_ranges)); |
2415
|
0 |
0 |
if (parsed_options.count("presegmented") && result) |
|
0 |
1 |
if (parsed_options.count("presegmented") && result) |
2416
|
0 |
0 |
result.reset(input_format::new_presegmented_tokenizer(result.release())); |
2419
|
0 |
0 |
if (parsed_options.count("joint_with_parsing") && result) { |
|
0 |
1 |
if (parsed_options.count("joint_with_parsing") && result) { |
2421
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_max_sentence_len") && !parse_int(parsed_options["joint_max_sentence_len"], "joint max sentence len", max_sentence_len, parse_error)) |
2425
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_change_boundary_logprob") && !parse_double(parsed_options["joint_change_boundary_logprob"], "joint change boundary logprob", change_boundary_logprob, parse_error)) |
2429
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
|
0 |
0 |
if (parsed_options.count("joint_sentence_logprob") && !parse_double(parsed_options["joint_sentence_logprob"], "joint sentence logprob", sentence_logprob, parse_error)) |
2432
|
0 |
0 |
result.reset(new joint_with_parsing_tokenizer(result.release(), *this, max_sentence_len, change_boundary_logprob, sentence_logprob)); |
2441
|
0 |
1 |
if (taggers.empty()) return error.assign("No tagger defined for the UDPipe model!"), false; |
2442
|
1 |
0 |
if (s.empty()) return true; |
2445
|
1 |
0 |
if (!c) c = new tagger_cache(); |
2450
|
1 |
7 |
for (size_t i = 1; i < s.words.size(); i++) |
2454
|
7 |
1 |
for (size_t i = 1; i < s.words.size(); i++) { |
2462
|
1 |
1 |
for (auto&& tagger : taggers) { |
2463
|
0 |
1 |
if (!tagger.tagger) return error.assign("No tagger defined for the UDPipe model!"), false; |
2467
|
7 |
1 |
for (size_t i = 0; i < c->lemmas.size(); i++) |
2472
|
1 |
0 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
0 |
1 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
0 |
0 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
|
0 |
1 |
if (taggers.size() == 1 && taggers[0].raw && taggers[0].tagger->get_morpho()) { |
2474
|
0 |
0 |
for (size_t i = 0; i < c->forms_string_pieces.size(); i++) { |
2475
|
0 |
0 |
if (morpho->analyze(c->forms_string_pieces[i], morphodita::morpho::GUESSER, c->lemmas) == morphodita::morpho::GUESSER) |
2476
|
0 |
0 |
s.words[i + 1].misc.append(s.words[i + 1].misc.empty() ? "" : "|").append("MorphoGuesser=Yes"); |
2491
|
0 |
1 |
if (!parser) return error.assign("No parser defined for the UDPipe model!"), false; |
2492
|
1 |
0 |
if (s.empty()) return true; |
2495
|
1 |
0 |
if (!c) c = new parser_cache(); |
2498
|
1 |
0 |
if (!named_values::parse(options, c->options, error)) |
2500
|
0 |
1 |
if (c->options.count("beam_search")) |
2501
|
0 |
0 |
if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error)) |
|
0 |
0 |
if (!parse_int(c->options["beam_search"], "beam_search", beam_search, error)) |
2505
|
7 |
1 |
for (size_t i = 1; i < s.words.size(); i++) { |
2517
|
7 |
1 |
for (size_t i = 1; i < s.words.size(); i++) |
2526
|
1 |
0 |
if (!is.get(version)) return nullptr; |
2527
|
1 |
0 |
if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr; |
2532
|
0 |
1 |
if (version >= 2) { |
2534
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
2535
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return nullptr; |
2539
|
1 |
0 |
if (!m) return nullptr; |
2542
|
1 |
0 |
if (!is.get(tokenizer)) return nullptr; |
|
1 |
0 |
if (!is.get(tokenizer)) return nullptr; |
2543
|
1 |
0 |
m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
1 |
0 |
m->tokenizer_factory.reset(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
2544
|
1 |
0 |
if (tokenizer && !m->tokenizer_factory) return nullptr; |
|
1 |
0 |
if (tokenizer && !m->tokenizer_factory) return nullptr; |
|
1 |
0 |
if (tokenizer && !m->tokenizer_factory) return nullptr; |
2545
|
1 |
0 |
m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
1 |
0 |
m->splitter.reset(tokenizer ? multiword_splitter::load(is) : nullptr); |
2546
|
1 |
0 |
if (tokenizer && !m->splitter) return nullptr; |
|
1 |
0 |
if (tokenizer && !m->splitter) return nullptr; |
|
1 |
0 |
if (tokenizer && !m->splitter) return nullptr; |
2549
|
1 |
0 |
char taggers; if (!is.get(taggers)) return nullptr; |
|
1 |
0 |
char taggers; if (!is.get(taggers)) return nullptr; |
2550
|
1 |
1 |
for (char i = 0; i < taggers; i++) { |
2551
|
1 |
0 |
char lemma; if (!is.get(lemma)) return nullptr; |
|
1 |
0 |
char lemma; if (!is.get(lemma)) return nullptr; |
2552
|
1 |
0 |
char xpostag; if (!is.get(xpostag)) return nullptr; |
|
1 |
0 |
char xpostag; if (!is.get(xpostag)) return nullptr; |
2553
|
1 |
0 |
char feats; if (!is.get(feats)) return nullptr; |
|
1 |
0 |
char feats; if (!is.get(feats)) return nullptr; |
2554
|
1 |
0 |
int model_type = is.peek(); |
2557
|
1 |
0 |
model_type == morphodita::tagger_ids::CONLLU3); |
|
1 |
0 |
model_type == morphodita::tagger_ids::CONLLU3); |
2558
|
1 |
0 |
morphodita::tagger* tagger = morphodita::tagger::load(is); |
2559
|
1 |
0 |
if (!tagger) return nullptr; |
2560
|
1 |
0 |
m->taggers.emplace_back(raw, i == 0, int(lemma), bool(xpostag), bool(feats), tagger); |
2564
|
1 |
0 |
if (!is.get(parser)) return nullptr; |
|
1 |
0 |
if (!is.get(parser)) return nullptr; |
2565
|
1 |
0 |
m->parser.reset(parser ? parsito::parser::load(is) : nullptr); |
|
1 |
0 |
m->parser.reset(parser ? parsito::parser::load(is) : nullptr); |
2566
|
1 |
0 |
if (parser && !m->parser) return nullptr; |
|
1 |
0 |
if (parser && !m->parser) return nullptr; |
|
1 |
0 |
if (parser && !m->parser) return nullptr; |
2576
|
0 |
0 |
for (string line; getline(is, line); ) { |
|
0 |
0 |
for (string line; getline(is, line); ) { |
2578
|
0 |
0 |
block.push_back('\n'); |
2581
|
0 |
0 |
if (is.eof() && !block.empty()) is.clear(istream::eofbit); |
|
0 |
0 |
if (is.eof() && !block.empty()) is.clear(istream::eofbit); |
|
0 |
0 |
if (is.eof() && !block.empty()) is.clear(istream::eofbit); |
2595
|
0 |
0 |
if (make_copy) { |
2605
|
0 |
0 |
if (text.len) { |
2613
|
0 |
0 |
while (tokenizer->next_sentence(input, error)) { |
|
0 |
0 |
while (tokenizer->next_sentence(input, error)) { |
2614
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
|
0 |
0 |
if (input.get_new_par() && !paragraph.empty()) { |
2615
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
2616
|
0 |
0 |
for (auto&& sentence : paragraph) |
2617
|
0 |
0 |
sentences.push_back(sentence); |
2620
|
0 |
0 |
paragraph.push_back(input); |
2622
|
0 |
0 |
if (!error.empty()) return false; |
2624
|
0 |
0 |
if (!paragraph.empty()) { |
2625
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
|
0 |
0 |
if (!parse_paragraph(paragraph, error)) return false; |
2626
|
0 |
0 |
for (auto&& sentence : paragraph) |
2627
|
0 |
0 |
sentences.push_back(sentence); |
2633
|
0 |
0 |
if (sentences_index < sentences.size()) { |
2643
|
0 |
0 |
vector sentence_boundary(1, true); |
2644
|
0 |
0 |
vector token_boundary(1, true); |
2646
|
0 |
0 |
for (auto&& s : paragraph) { |
2648
|
0 |
0 |
for (unsigned i = 1; i < s.words.size(); i++) { |
2649
|
0 |
0 |
all_words.words.push_back(s.words[i]); |
2651
|
0 |
0 |
sentence_boundary.push_back(i+1 == s.words.size()); |
2652
|
0 |
0 |
token_boundary.push_back(true); |
2655
|
0 |
0 |
for (auto&& mwt : s.multiword_tokens) { |
2656
|
0 |
0 |
all_words.multiword_tokens.push_back(mwt); |
2659
|
0 |
0 |
for (int i = all_words.multiword_tokens.back().id_first; i < all_words.multiword_tokens.back().id_last; i++) |
2664
|
0 |
0 |
vector best_logprob(all_words.words.size(), -numeric_limits::infinity()); best_logprob[0] = 0.; |
2665
|
0 |
0 |
vector best_length(all_words.words.size(), 0); |
2666
|
0 |
0 |
sentence s; |
2668
|
0 |
0 |
for (unsigned start = 1; start < all_words.words.size(); start++) { |
2669
|
0 |
0 |
if (!token_boundary[start - 1]) continue; |
2670
|
0 |
0 |
s.clear(); |
2671
|
0 |
0 |
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
|
0 |
0 |
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
|
0 |
0 |
for (unsigned end = start + 1; end <= all_words.words.size() && (end - start) <= unsigned(max_sentence_len); end++) { |
2672
|
0 |
0 |
s.words.push_back(all_words.words[end - 1]); |
2674
|
0 |
0 |
if (!token_boundary[end - 1]) continue; |
2676
|
0 |
0 |
for (unsigned i = 1; i < s.words.size(); i++) { |
2682
|
0 |
0 |
if (!model.parse(s, DEFAULT, error, &cost)) return false; |
|
0 |
0 |
if (!model.parse(s, DEFAULT, error, &cost)) return false; |
2684
|
0 |
0 |
if (best_logprob[start - 1] + cost > best_logprob[end - 1]) { |
2692
|
0 |
0 |
for (unsigned end = all_words.words.size(); end > 1; end -= best_length[end - 1]) |
2693
|
0 |
0 |
sentence_lengths.push_back(best_length[end - 1]); |
2699
|
0 |
0 |
for (unsigned i = 1; i < sentence_lengths.size(); i++) { |
2702
|
0 |
0 |
paragraph.emplace_back(); |
2703
|
0 |
0 |
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
|
0 |
0 |
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
|
0 |
0 |
while (!all_words.multiword_tokens.empty() && unsigned(all_words.multiword_tokens.front().id_first) < sentence_lengths[i]) { |
2704
|
0 |
0 |
paragraph.back().multiword_tokens.push_back(all_words.multiword_tokens.front()); |
2710
|
0 |
0 |
for (unsigned word = sentence_lengths[i - 1]; word < sentence_lengths[i]; word++) { |
2711
|
0 |
0 |
paragraph.back().words.push_back(all_words.words[word]); |
2718
|
0 |
0 |
if (!paragraph.empty()) { |
2719
|
0 |
0 |
if (new_document) { |
2720
|
0 |
0 |
paragraph.front().set_new_doc(true, document_id); |
2724
|
0 |
0 |
paragraph.front().set_new_par(true); |
2732
|
0 |
7 |
if (raw) { |
2733
|
0 |
0 |
if (lemma) word.lemma.assign(analysis.lemma); |
2734
|
0 |
0 |
if (xpostag) word.xpostag.assign(analysis.tag); |
2739
|
7 |
0 |
if (lemma == 1) { |
2741
|
0 |
0 |
} else if (lemma == 2) { |
2745
|
0 |
0 |
if (analysis.lemma[0] == '~') { |
2747
|
0 |
0 |
if (end != string::npos) { |
2749
|
0 |
0 |
if (analysis.lemma.compare(end + 1, string::npos, word.lemma) == 0) |
2756
|
0 |
7 |
if (version == 2) { |
2758
|
0 |
0 |
for (auto && chr : word.lemma) |
2759
|
0 |
0 |
if (chr == '\001') |
2761
|
0 |
7 |
} else if (version >= 3) { |
2763
|
0 |
0 |
for (size_t i = 0; i + 1 < word.lemma.size(); i++) |
2764
|
0 |
0 |
if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0)) |
|
0 |
0 |
if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0)) |
|
0 |
0 |
if (word.lemma[i] == char(0xC2) && word.lemma[i+1] == char(0xA0)) |
2768
|
0 |
7 |
if (!upostag && !xpostag && !feats) return; |
|
0 |
0 |
if (!upostag && !xpostag && !feats) return; |
2773
|
7 |
0 |
if (upostag) word.upostag.assign(analysis.tag, start, end - start); |
2775
|
7 |
0 |
if (!xpostag && !feats) return; |
2780
|
7 |
0 |
if (xpostag) word.xpostag.assign(analysis.tag, start, end - start); |
2782
|
7 |
0 |
if (!feats) return; |
2793
|
14 |
0 |
if (version <= 1) return output.assign(form.str, form.len); |
2835
|
0 |
0 |
for (auto&& chr : utf8::decoder(form.str, form.len)) { |
2837
|
0 |
0 |
if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {} |
|
0 |
0 |
if (chr == 0x640 || (chr >= 0x64B && chr <= 0x657) || chr == 0x670) {} |
2838
|
0 |
0 |
else if (chr == 0x622) utf8::append(output, 0x627); |
2839
|
0 |
0 |
else if (chr == 0x623) utf8::append(output, 0x627); |
2840
|
0 |
0 |
else if (chr == 0x624) utf8::append(output, 0x648); |
2841
|
0 |
0 |
else if (chr == 0x625) utf8::append(output, 0x627); |
2842
|
0 |
0 |
else if (chr == 0x626) utf8::append(output, 0x64A); |
2843
|
0 |
0 |
else if (chr == 0x671) utf8::append(output, 0x627); |
2844
|
0 |
0 |
else if (chr == 0x6A9) utf8::append(output, 0x643); |
2845
|
0 |
0 |
else if (chr == 0x6AA) utf8::append(output, 0x643); |
2846
|
0 |
0 |
else if (chr == 0x6CC) utf8::append(output, 0x64A); |
2848
|
0 |
0 |
else if (chr == ' ' && version == 2) utf8::append(output, 0x01); |
|
0 |
0 |
else if (chr == ' ' && version == 2) utf8::append(output, 0x01); |
2849
|
0 |
0 |
else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0); |
|
0 |
0 |
else if (chr == ' ' && version >= 3) utf8::append(output, 0xA0); |
2855
|
0 |
0 |
if (output.empty() && form.len) |
|
0 |
0 |
if (output.empty() && form.len) |
|
0 |
0 |
if (output.empty() && form.len) |
2865
|
7 |
0 |
if (version <= 2) return output.assign(lemma.str, lemma.len); |
2869
|
0 |
0 |
for (size_t i = 0; i < lemma.len; i++) { |
2871
|
0 |
0 |
if (lemma.str[i] == ' ') utf8::append(output, 0xA0); |
2982
|
0 |
0 |
for (string line; getline(is, line); ) |
|
0 |
0 |
for (string line; getline(is, line); ) |
2983
|
0 |
0 |
whole.append(line).push_back('\n'); |
2985
|
0 |
0 |
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
|
0 |
0 |
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
|
0 |
0 |
if (is.eof() && !whole.empty()) is.clear(istream::eofbit); |
3009
|
0 |
0 |
set_input(input); |
3012
|
0 |
0 |
set_output(output); |
3022
|
0 |
0 |
if (input.empty()) { |
3024
|
0 |
0 |
} else if (input == "tokenize" || input == "tokenizer") { |
3026
|
0 |
0 |
} else if (input.compare(0, 10, "tokenizer=") == 0) { |
3043
|
0 |
0 |
this->output = output.empty() ? "conllu" : output; |
3060
|
0 |
0 |
if (input == "tokenizer") { |
3061
|
0 |
0 |
reader.reset(m->new_tokenizer(tokenizer)); |
3062
|
0 |
0 |
if (!reader) return error.assign("The model does not have a tokenizer!"), false; |
|
0 |
0 |
if (!reader) return error.assign("The model does not have a tokenizer!"), false; |
3064
|
0 |
0 |
reader.reset(input_format::new_input_format(input)); |
3065
|
0 |
0 |
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
|
0 |
0 |
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
|
0 |
0 |
if (!reader) return error.assign("The requested input format '").append(input).append("' does not exist!"), false; |
3067
|
0 |
0 |
reader->reset_document(document_id); |
3069
|
0 |
0 |
unique_ptr writer(output_format::new_output_format(output)); |
3070
|
0 |
0 |
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
|
0 |
0 |
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
|
0 |
0 |
if (!writer) return error.assign("The requested output format '").append(output).append("' does not exist!"), false; |
3073
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
|
0 |
0 |
while (immediate ? reader->read_block(is, block) : bool(getwhole(is, block))) { |
3074
|
0 |
0 |
reader->set_text(block); |
3075
|
0 |
0 |
while (reader->next_sentence(s, error)) { |
|
0 |
0 |
while (reader->next_sentence(s, error)) { |
3076
|
0 |
0 |
if (tagger != NONE) |
3077
|
0 |
0 |
if (!m->tag(s, tagger, error)) |
|
0 |
0 |
if (!m->tag(s, tagger, error)) |
3080
|
0 |
0 |
if (parser != NONE) |
3081
|
0 |
0 |
if (!m->parse(s, parser, error)) |
|
0 |
0 |
if (!m->parse(s, parser, error)) |
3084
|
0 |
0 |
writer->write_sentence(s, os); |
3086
|
0 |
0 |
if (!error.empty()) return false; |
3088
|
0 |
0 |
writer->finish_document(os); |
3198
|
0 |
0 |
format_tagged_lemma(result); |
3203
|
0 |
0 |
for (auto&& lemma : lemmas) |
3206
|
0 |
0 |
if (lemmas.size() > 1) |
3214
|
0 |
0 |
if (converter) converter->convert(lemma); |
3218
|
0 |
0 |
if (converter) converter->convert_analyzed(lemmas); |
3231
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(lemma.lemma, parent); ) |
3233
|
0 |
0 |
if (converter) converter->convert(lemma); |
3241
|
0 |
0 |
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
|
0 |
0 |
return derinet ? new root_derivation_formatter(derinet) : nullptr; |
3250
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
0 |
0 |
if (converter) converter->convert(lemma); |
3251
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(current.lemma, parent); current.lemma.swap(parent.lemma)) { |
3252
|
0 |
0 |
tagged_lemma parrent_lemma(parent.lemma, current.tag); |
3253
|
0 |
0 |
if (converter) converter->convert(parrent_lemma); |
|
0 |
0 |
if (converter) converter->convert(parrent_lemma); |
3254
|
0 |
0 |
lemma.lemma.append(" ").append(parrent_lemma.lemma); |
3263
|
0 |
0 |
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
|
0 |
0 |
return derinet ? new path_derivation_formatter(derinet) : nullptr; |
3272
|
0 |
0 |
if (converter) converter->convert(lemma); |
|
0 |
0 |
if (converter) converter->convert(lemma); |
3273
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
|
0 |
0 |
for (derivated_lemma parent; derinet->parent(root, parent); root.swap(parent.lemma)) {} |
3274
|
0 |
0 |
format_tree(root, tag, lemma, converter); |
3280
|
0 |
0 |
if (converter) { |
3281
|
0 |
0 |
tagged_lemma current(root, tag); |
3282
|
0 |
0 |
converter->convert(current); |
3283
|
0 |
0 |
tree.lemma.append(" ").append(current.lemma); |
3285
|
0 |
0 |
tree.lemma.append(" ").append(root); |
3288
|
0 |
0 |
if (derinet->children(root, children)) |
|
0 |
0 |
if (derinet->children(root, children)) |
3289
|
0 |
0 |
for (auto&& child : children) |
3290
|
0 |
0 |
format_tree(child.lemma, tag, tree, converter); |
3291
|
0 |
0 |
tree.lemma.push_back(' '); |
3299
|
0 |
0 |
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
|
0 |
0 |
return derinet ? new tree_derivation_formatter(derinet) : nullptr; |
3303
|
0 |
0 |
if (name == "none") return new_none_derivation_formatter(); |
3304
|
0 |
0 |
if (name == "root") return new_root_derivation_formatter(derinet); |
3305
|
0 |
0 |
if (name == "path") return new_path_derivation_formatter(derinet); |
3306
|
0 |
0 |
if (name == "tree") return new_tree_derivation_formatter(derinet); |
3336
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
529 |
133 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
133 |
36 |
while (len--) |
|
1009 |
66 |
while (len--) |
|
0 |
0 |
while (len--) |
|
64 |
10 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
3337
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
307 |
222 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
111 |
22 |
if (*a++ != *b++) |
|
1000 |
9 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
61 |
3 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
|
0 |
0 |
if (*a++ != *b++) |
3346
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
729 |
346 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
20 |
118 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
120 |
20 |
while (len--) |
3419
|
0 |
0 |
data.reserve(16); |
3423
|
0 |
0 |
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
|
0 |
0 |
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
|
0 |
0 |
if (uint8_t(val) != val) training_failure("Should encode value " << val << " in one byte!"); |
3428
|
0 |
0 |
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
|
0 |
0 |
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
|
0 |
0 |
if (uint16_t(val) != val) training_failure("Should encode value " << val << " in two bytes!"); |
3447
|
0 |
0 |
if (!(str.len < 255)) add_4B(str.len); |
3606
|
30 |
10 |
while (size) { |
|
0 |
0 |
while (size) { |
|
0 |
0 |
while (size) { |
3608
|
21 |
9 |
if (unaligned_load(first + step) < val) { |
|
0 |
0 |
if (unaligned_load(first + step) < val) { |
|
0 |
0 |
if (unaligned_load(first + step) < val) { |
3650
|
0 |
0 |
class persistent_unordered_map { |
|
0 |
0 |
class persistent_unordered_map { |
3696
|
0 |
0 |
struct persistent_unordered_map::fnv_hash { |
3699
|
52 |
24 |
while (mask < num) |
3701
|
24 |
0 |
hash.resize(mask + 1); |
3705
|
484 |
0 |
uint32_t size = data.next_4B(); |
3707
|
484 |
0 |
hash.resize(size); |
3708
|
484 |
0 |
memcpy(hash.data(), data.next(size), size * sizeof(uint32_t)); |
3710
|
484 |
0 |
size = data.next_4B(); |
3711
|
484 |
0 |
this->data.resize(size); |
3712
|
145 |
339 |
if (size) memcpy(this->data.data(), data.next(size), size); |
|
145 |
0 |
if (size) memcpy(this->data.data(), data.next(size), size); |
3716
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
8 |
0 |
if (len <= 0) return 0; |
|
0 |
8 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
0 |
0 |
if (len <= 0) return 0; |
|
330 |
0 |
if (len <= 0) return 0; |
|
78 |
0 |
if (len <= 0) return 0; |
|
20 |
0 |
if (len <= 0) return 0; |
|
20 |
0 |
if (len <= 0) return 0; |
3717
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
2 |
6 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
0 |
0 |
if (len == 1) return unaligned_load(data); |
|
13 |
317 |
if (len == 1) return unaligned_load(data); |
|
6 |
72 |
if (len == 1) return unaligned_load(data); |
|
4 |
16 |
if (len == 1) return unaligned_load(data); |
|
4 |
16 |
if (len == 1) return unaligned_load(data); |
3718
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
5 |
1 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
0 |
0 |
if (len == 2) return unaligned_load(data); |
|
48 |
269 |
if (len == 2) return unaligned_load(data); |
|
67 |
5 |
if (len == 2) return unaligned_load(data); |
|
15 |
1 |
if (len == 2) return unaligned_load(data); |
|
15 |
1 |
if (len == 2) return unaligned_load(data); |
3721
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
38 |
5 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
0 |
0 |
while (len--) |
|
144 |
48 |
while (len--) |
|
1003 |
67 |
while (len--) |
|
114 |
15 |
while (len--) |
|
114 |
15 |
while (len--) |
3735
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
8 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return nullptr; |
3741
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
8 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
|
0 |
0 |
if (len <= 2) |
3742
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
8 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
|
0 |
0 |
return data != end ? data + len : nullptr; |
3744
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
3745
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
|
0 |
0 |
if (small_memeq(str, data, len)) return data + len; |
3756
|
330 |
16 |
if (unsigned(len) >= hashes.size()) return nullptr; |
|
78 |
14 |
if (unsigned(len) >= hashes.size()) return nullptr; |
3762
|
48 |
282 |
if (len <= 2) |
|
67 |
11 |
if (len <= 2) |
3763
|
234 |
48 |
return data != end ? (const T*)(data + len) : nullptr; |
|
10 |
1 |
return data != end ? (const T*)(data + len) : nullptr; |
3765
|
58 |
12 |
while (data < end) { |
|
75 |
1 |
while (data < end) { |
3766
|
36 |
22 |
if (small_memeq(str, data, len)) return (const T*)(data + len); |
|
66 |
9 |
if (small_memeq(str, data, len)) return (const T*)(data + len); |
3775
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
8 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
|
0 |
0 |
if (unsigned(len) >= hashes.size()) return; |
3781
|
0 |
0 |
while (data < end) { |
|
13 |
8 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
3791
|
1 |
1 |
for (unsigned len = 0; len < hashes.size(); len++) { |
|
0 |
0 |
for (unsigned len = 0; len < hashes.size(); len++) { |
|
0 |
0 |
for (unsigned len = 0; len < hashes.size(); len++) { |
3795
|
1 |
1 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
|
0 |
0 |
while (data < end) { |
3809
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
|
0 |
0 |
return unsigned(len) < hashes.size() ? hashes[len].data.data() : nullptr; |
3813
|
2 |
22 |
if (hashes.size() == 0) hashes.emplace_back(1); |
3814
|
2 |
20 |
else if (hashes.size() == 1) hashes.emplace_back(1<<8); |
3815
|
2 |
18 |
else if (hashes.size() == 2) hashes.emplace_back(1<<16); |
3820
|
20 |
0 |
if (unsigned(str_len) < hashes.size()) |
3825
|
24 |
2 |
for (auto&& hash : hashes) { |
3827
|
131633 |
24 |
for (auto&& len : hash.hash) total += len, len = total - len; |
3833
|
20 |
0 |
if (unsigned(str_len) < hashes.size()) { |
3844
|
24 |
2 |
for (auto&& hash : hashes) |
3845
|
131633 |
24 |
for (int i = hash.hash.size() - 1; i >= 0; i--) |
3846
|
131609 |
24 |
hash.hash[i] = i > 0 ? hash.hash[i-1] : 0; |
3853
|
484 |
103 |
for (unsigned i = 0; i < sizes; i++) |
3931
|
0 |
0 |
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
3938
|
0 |
0 |
if (lemma_data) { |
3940
|
0 |
0 |
if (parent_encoded) { |
3944
|
0 |
0 |
if (parent_data[parent_len]) |
3954
|
0 |
0 |
if (dictionary) lemma.len = dictionary->lemma_id_len(lemma); |
3961
|
0 |
0 |
if (lemma_data) { |
3964
|
0 |
0 |
if (children_len) { |
3966
|
0 |
0 |
for (unsigned i = 0; i < children_len; i++) { |
3970
|
0 |
0 |
if (child_data[child_len]) |
3982
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
3985
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
3986
|
0 |
0 |
derinet.resize(data.next_4B()); |
|
0 |
0 |
derinet.resize(data.next_4B()); |
3990
|
0 |
0 |
for (int pass = 1; pass <= 3; pass++) { |
3991
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
3994
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
3995
|
0 |
0 |
lemma.resize(lemma.size() - data.next_1B()); |
|
0 |
0 |
lemma.resize(lemma.size() - data.next_1B()); |
3996
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
3997
|
0 |
0 |
lemma.push_back(data.next_1B()); |
3999
|
0 |
0 |
unsigned char lemma_comment_len = data.next_1B(); |
4000
|
0 |
0 |
const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr; |
|
0 |
0 |
const char* lemma_comment = lemma_comment_len ? data.next(lemma_comment_len) : nullptr; |
4002
|
0 |
0 |
unsigned children = data.next_2B(); |
4004
|
0 |
0 |
if (pass == 3) parent.clear(); |
4006
|
0 |
0 |
int operations = data.next_1B(); |
4007
|
0 |
0 |
if (operations) { |
4008
|
0 |
0 |
int remove_start = operations & REMOVE_START ? data.next_1B() : 0; |
|
0 |
0 |
int remove_start = operations & REMOVE_START ? data.next_1B() : 0; |
4009
|
0 |
0 |
int remove_end = operations & REMOVE_END ? data.next_1B() : 0; |
|
0 |
0 |
int remove_end = operations & REMOVE_END ? data.next_1B() : 0; |
4010
|
0 |
0 |
if (operations & ADD_START) { |
4011
|
0 |
0 |
int add_start = data.next_1B(); |
4012
|
0 |
0 |
const char* str = data.next(add_start); |
4013
|
0 |
0 |
if (pass == 3) parent.assign(str, str + add_start); |
4015
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end); |
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), lemma.begin() + remove_start, lemma.end() - remove_end); |
4016
|
0 |
0 |
if (operations & ADD_END) { |
4017
|
0 |
0 |
int add_end = data.next_1B(); |
4018
|
0 |
0 |
const char* str = data.next(add_end); |
4019
|
0 |
0 |
if (pass == 3) parent.insert(parent.end(), str, str + add_end); |
4023
|
0 |
0 |
if (pass == 1) { |
4025
|
0 |
0 |
} else if (pass == 2) { |
4028
|
0 |
0 |
while (lemma_comment_len--) *lemma_data++ = *lemma_comment++; |
4031
|
0 |
0 |
if (children) unaligned_store(((uint32_t*)lemma_data) + children - 1, 0); |
4032
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
|
0 |
0 |
} else if (pass == 3 && !parent.empty()) { |
4043
|
0 |
0 |
assert(lemma_data && parent_data); |
4046
|
0 |
0 |
assert(parent.size() < (1<<8) && parent_offset < (1<<24)); |
|
0 |
0 |
assert(parent.size() < (1<<8) && parent_offset < (1<<24)); |
4050
|
0 |
0 |
assert(lemma.size() < (1<<8) && lemma_offset < (1<<24)); |
|
0 |
0 |
assert(lemma.size() < (1<<8) && lemma_offset < (1<<24)); |
4055
|
0 |
0 |
if (child_index+1 < children_len) |
4060
|
0 |
0 |
if (pass == 1) |
4061
|
0 |
0 |
derinet.done_adding(); |
4062
|
0 |
0 |
if (pass == 2) |
4064
|
0 |
0 |
} |
4097
|
22 |
7 |
while (form_tmp.len && !rest_has_Lut) |
|
22 |
0 |
while (form_tmp.len && !rest_has_Lut) |
4106
|
1 |
6 |
if (first_Lut && !rest_has_Lut) { // common case allowing fast execution |
4111
|
0 |
6 |
} else if (!first_Lut && rest_has_Lut) { |
4114
|
0 |
6 |
} else if (first_Lut && rest_has_Lut) { |
4121
|
0 |
0 |
while (form_tmp.len) { |
4162
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) |
4163
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_' || |
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_' || |
4164
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
|
0 |
0 |
(lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9')) |
4171
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) { |
4172
|
0 |
0 |
if (lemma.str[len] == '`' || lemma.str[len] == '_') |
4174
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
|
0 |
0 |
if (lemma.str[len] == '-' && len+1 < lemma.len && lemma.str[len+1] >= '0' && lemma.str[len+1] <= '9') { |
4176
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
|
0 |
0 |
while (len < lemma.len && lemma.str[len] >= '0' && lemma.str[len] <= '9') len++; |
4186
|
0 |
0 |
if (addinfo_len) { |
4187
|
0 |
0 |
res.reserve(addinfo_len + 4); |
4188
|
0 |
0 |
if (addinfo[0] != 255) { |
4193
|
0 |
0 |
for (int i = 1; i < addinfo_len; i++) |
4201
|
0 |
0 |
for (int i = 1; i + 2 < addinfo_len; i++) |
4202
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
|
0 |
0 |
if (addinfo[i] == '_' && addinfo[i+1] == ',' && addinfo[i+2] == 'x') |
4212
|
0 |
0 |
if (lemma_info < lemma.str + lemma.len) { |
4216
|
0 |
0 |
if (*lemma_info == '-') { |
4219
|
0 |
0 |
lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9'); |
|
0 |
0 |
lemma_additional_info < lemma.str + lemma.len && (*lemma_additional_info >= '0' && *lemma_additional_info <= '9'); |
4223
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
|
0 |
0 |
if (lemma_additional_info == lemma_info + 1 || (lemma_additional_info < lemma.str + lemma.len && *lemma_additional_info != '`' && *lemma_additional_info != '_') || lemma_num >= 255) { |
4224
|
0 |
0 |
if (die_on_failure) |
4225
|
0 |
0 |
training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!"); |
|
0 |
0 |
training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!"); |
|
0 |
0 |
training_failure("Lemma number " << lemma_num << " in lemma " << lemma << " out of range!"); |
4231
|
0 |
0 |
while (lemma_additional_info < lemma.str + lemma.len) |
4234
|
0 |
0 |
if (data.size() > 255) { |
4235
|
0 |
0 |
if (die_on_failure) |
4236
|
0 |
0 |
training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!'); |
|
0 |
0 |
training_failure("Too long lemma info " << lemma_info << " in lemma " << lemma << '!'); |
4246
|
0 |
0 |
if (data.empty()) return true; |
4247
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
|
0 |
0 |
if (data[0] != 255 && (!other_addinfo_len || other_addinfo[0] != data[0])) return false; |
4291
|
0 |
0 |
if (filters.empty()) return true; |
4294
|
0 |
0 |
for (auto&& filter : filters) { |
4296
|
0 |
0 |
while (tag_pos < filter.pos) |
4297
|
0 |
0 |
if (!tag[tag_pos++]) |
4299
|
0 |
0 |
if (!tag[tag_pos]) |
4304
|
0 |
0 |
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
|
0 |
0 |
for (int i = 1; i < filter.chars_len && ((!matched) ^ filter.negate); i++) |
4306
|
0 |
0 |
if (!matched) return false; |
4346
|
12 |
1 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
4348
|
12 |
1 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
4354
|
1 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
|
0 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
|
0 |
0 |
vector root(max(lemmas.max_length(), roots.max_length())); |
4356
|
2 |
1 |
for (int pass = 1; pass <= 2; pass++) { |
|
0 |
0 |
for (int pass = 1; pass <= 2; pass++) { |
|
0 |
0 |
for (int pass = 1; pass <= 2; pass++) { |
4357
|
1 |
1 |
if (pass > 1) data.seek(data_position); |
|
1 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
|
0 |
0 |
if (pass > 1) data.seek(data_position); |
4362
|
2 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
20 |
2 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
|
0 |
0 |
for (int i = data.next_4B(); i > 0; i--) { |
4363
|
20 |
0 |
lemma_len -= data.next_1B(); |
|
0 |
0 |
lemma_len -= data.next_1B(); |
|
0 |
0 |
lemma_len -= data.next_1B(); |
4364
|
20 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
106 |
20 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
|
0 |
0 |
for (int i = data.next_1B(); i > 0; i--) |
4365
|
106 |
0 |
lemma[lemma_len++] = data.next_1B(); |
|
0 |
0 |
lemma[lemma_len++] = data.next_1B(); |
|
0 |
0 |
lemma[lemma_len++] = data.next_1B(); |
4366
|
20 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
|
0 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
|
0 |
0 |
unsigned char lemma_info_len = data.next_1B(); |
4367
|
0 |
20 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
|
0 |
0 |
const char* lemma_info = lemma_info_len ? data.next(lemma_info_len) : nullptr; |
4368
|
20 |
0 |
unsigned lemma_roots = data.next_1B(); |
|
0 |
0 |
unsigned lemma_roots = data.next_1B(); |
|
0 |
0 |
unsigned lemma_roots = data.next_1B(); |
4373
|
10 |
10 |
if (pass == 1) { |
|
0 |
0 |
if (pass == 1) { |
|
0 |
0 |
if (pass == 1) { |
4380
|
0 |
10 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
0 |
0 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
|
0 |
0 |
if (lemma_info_len) small_memcpy(lemma_data, lemma_info, lemma_info_len), lemma_data += lemma_info_len; |
4385
|
20 |
20 |
for (unsigned i = 0; i < lemma_roots; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots; i++) { |
4387
|
20 |
0 |
int operations = data.next_1B(); |
|
0 |
0 |
int operations = data.next_1B(); |
|
0 |
0 |
int operations = data.next_1B(); |
4388
|
4 |
16 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
4 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
28 |
4 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
|
0 |
0 |
if (operations & REMOVE_START) { int from = data.next_1B(), to = 0; while (from < root_len) root[to++] = root[from++]; root_len = to; } |
4389
|
12 |
8 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
12 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
|
0 |
0 |
if (operations & REMOVE_END) root_len -= data.next_1B(); |
4390
|
6 |
14 |
if (operations & ADD_START) { |
|
0 |
0 |
if (operations & ADD_START) { |
|
0 |
0 |
if (operations & ADD_START) { |
4391
|
6 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
38 |
6 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
|
0 |
0 |
int from = root_len, to = from + data.next_1B(); while (from > 0) root[--to] = root[--from]; root_len += to; |
4392
|
8 |
6 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
8 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
|
0 |
0 |
for (int i = 0; i < to; i++) root[i] = data.next_1B(); |
4394
|
12 |
8 |
if (operations & ADD_END) |
|
0 |
0 |
if (operations & ADD_END) |
|
0 |
0 |
if (operations & ADD_END) |
4395
|
12 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
22 |
12 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
|
0 |
0 |
for (int len = data.next_1B(); len > 0; len--) |
4396
|
22 |
0 |
root[root_len++] = data.next_1B(); |
|
0 |
0 |
root[root_len++] = data.next_1B(); |
|
0 |
0 |
root[root_len++] = data.next_1B(); |
4397
|
20 |
0 |
uint16_t clas = data.next_2B(); |
|
0 |
0 |
uint16_t clas = data.next_2B(); |
|
0 |
0 |
uint16_t clas = data.next_2B(); |
4399
|
10 |
10 |
if (pass == 1) { // for each root |
|
0 |
0 |
if (pass == 1) { // for each root |
|
0 |
0 |
if (pass == 1) { // for each root |
4408
|
0 |
10 |
assert(uint8_t(lemma_len) == lemma_len); |
|
0 |
0 |
assert(uint8_t(lemma_len) == lemma_len); |
|
0 |
0 |
assert(uint8_t(lemma_len) == lemma_len); |
4413
|
0 |
10 |
assert(uint8_t(root_len) == root_len); |
|
0 |
0 |
assert(uint8_t(root_len) == root_len); |
|
0 |
0 |
assert(uint8_t(root_len) == root_len); |
4418
|
1 |
1 |
if (pass == 1) { // after the whole pass |
|
0 |
0 |
if (pass == 1) { // after the whole pass |
|
0 |
0 |
if (pass == 1) { // after the whole pass |
4419
|
1 |
0 |
lemmas.done_adding(); |
|
0 |
0 |
lemmas.done_adding(); |
|
0 |
0 |
lemmas.done_adding(); |
4420
|
1 |
0 |
roots.done_adding(); |
|
0 |
0 |
roots.done_adding(); |
|
0 |
0 |
roots.done_adding(); |
4428
|
1 |
0 |
tags.resize(data.next_2B()); |
|
1 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
|
0 |
0 |
tags.resize(data.next_2B()); |
4429
|
6 |
1 |
for (auto&& tag : tags) { |
|
0 |
0 |
for (auto&& tag : tags) { |
|
0 |
0 |
for (auto&& tag : tags) { |
4430
|
6 |
0 |
tag.resize(data.next_1B()); |
|
0 |
0 |
tag.resize(data.next_1B()); |
|
0 |
0 |
tag.resize(data.next_1B()); |
4431
|
397 |
6 |
for (unsigned i = 0; i < tag.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < tag.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < tag.size(); i++) |
4432
|
397 |
0 |
tag[i] = data.next_1B(); |
|
0 |
0 |
tag[i] = data.next_1B(); |
|
0 |
0 |
tag[i] = data.next_1B(); |
4436
|
1 |
0 |
suffixes.load(data); |
|
0 |
0 |
suffixes.load(data); |
|
0 |
0 |
suffixes.load(data); |
4439
|
1 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
0 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
|
0 |
0 |
suffixes.iter_all([this](const char* suffix, int len, pointer_decoder& data) mutable { |
4444
|
6 |
1 |
for (unsigned i = 0; i < classes_len; i++) |
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) |
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) |
4450
|
6 |
1 |
for (unsigned i = 0; i < classes_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < classes_len; i++) { |
4452
|
6 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
6 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
|
0 |
0 |
if (classes_ptr_i >= classes.size()) classes.resize(classes_ptr_i + 1); |
4455
|
6 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
0 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
|
0 |
0 |
classes[classes_ptr_i].emplace_back(suffix_str, vector()); |
4456
|
6 |
6 |
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
|
0 |
0 |
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
|
0 |
0 |
for (const uint16_t* ptr = tags_ptr + prev_index; ptr < tags_ptr + index; ptr++) |
4457
|
6 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
0 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
|
0 |
0 |
classes[classes_ptr_i].back().second.emplace_back(unaligned_load(ptr)); |
4467
|
0 |
8 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
|
0 |
0 |
uint16_t** suff = max_suffix_len <= 16 ? suff_stack : (suff_heap.resize(max_suffix_len), suff_heap.data()); |
4469
|
8 |
8 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
0 |
0 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
|
0 |
0 |
for (int i = form.len; i >= 0 && suff_len < max_suffix_len; i--, suff_len++) { |
4477
|
8 |
8 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
8 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
8 |
8 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
|
0 |
0 |
for (int root_len = int(form.len) - --suff_len; suff_len >= 0 && root_len < int(roots.max_length()); suff_len--, root_len++) |
4478
|
8 |
0 |
if (unaligned_load(suff[suff_len])) { |
|
0 |
0 |
if (unaligned_load(suff[suff_len])) { |
|
0 |
0 |
if (unaligned_load(suff[suff_len])) { |
4482
|
8 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
0 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
|
0 |
0 |
roots.iter(form.str, root_len, [&](const char* root, pointer_decoder& root_data) { |
4487
|
10 |
3 |
if (small_memeq(form.str, root, root_len)) { |
|
0 |
0 |
if (small_memeq(form.str, root, root_len)) { |
|
0 |
0 |
if (small_memeq(form.str, root, root_len)) { |
4489
|
10 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
10 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
10 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
|
0 |
0 |
if (suffix_class_ptr < suff_data + suff_classes && unaligned_load(suffix_class_ptr) == root_class) { |
4492
|
0 |
10 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
|
0 |
0 |
if (lemma_data[lemma_len]) lemma += LemmaAddinfo::format(lemma_data + lemma_len + 1, lemma_data[lemma_len]); |
4496
|
10 |
10 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
0 |
0 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
|
0 |
0 |
for (unsigned i = unaligned_load(suff_tag_indices + (suffix_class_ptr - suff_data)); |
4498
|
10 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
0 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
|
0 |
0 |
lemmas.emplace_back(lemma, tags[unaligned_load(suff_tags + i)]); |
4508
|
0 |
0 |
int raw_lemma_len = addinfo.parse(lemma); |
|
0 |
0 |
int raw_lemma_len = addinfo.parse(lemma); |
4511
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
|
0 |
0 |
lemmas.iter(lemma.str, raw_lemma_len, [&](const char* lemma_str, pointer_decoder& data) { |
4517
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
|
0 |
0 |
if (small_memeq(lemma.str, lemma_str, raw_lemma_len) && addinfo.match_lemma_id(lemma_info, lemma_info_len) && LemmaAddinfo::generatable(lemma_info, lemma_info_len)) { |
4522
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < lemma_roots_len; i++) { |
4528
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
|
0 |
0 |
for (auto&& suffix : classes[clas]) { |
4530
|
0 |
0 |
for (auto&& tag : suffix.second) |
|
0 |
0 |
for (auto&& tag : suffix.second) |
|
0 |
0 |
for (auto&& tag : suffix.second) |
4531
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
|
0 |
0 |
if (filter.matches(tags[tag].c_str())) { |
4532
|
0 |
0 |
if (!forms) { |
|
0 |
0 |
if (!forms) { |
|
0 |
0 |
if (!forms) { |
4533
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
|
0 |
0 |
lemmas_forms.emplace_back(string(lemma.str, raw_lemma_len) + LemmaAddinfo::format(lemma_info, lemma_info_len)); |
4537
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
|
0 |
0 |
if (root_with_suffix.empty() && root_len + suffix.first.size()) { |
4538
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
|
0 |
0 |
root_with_suffix.reserve(root_len + suffix.first.size()); |
4543
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
|
0 |
0 |
forms->emplace_back(root_with_suffix, tags[tag]); |
4590
|
0 |
0 |
for (unsigned tag_filters_len = data.next_1B(); tag_filters_len; tag_filters_len--) { |
4594
|
0 |
0 |
tag_filters.emplace_back(tag_filter.c_str()); |
4605
|
0 |
0 |
if (!form.len) return; |
4609
|
0 |
0 |
middle_masks.reserve(form.len); |
4611
|
0 |
0 |
for (unsigned initial = 0; initial < form.len; initial++) { |
4614
|
0 |
0 |
if (initial) { |
4616
|
0 |
0 |
if (!found) break; |
4621
|
0 |
0 |
if (initial_mask) { |
4622
|
0 |
0 |
middle_masks.resize(initial); |
4623
|
0 |
0 |
middle_masks.emplace_back(initial_mask); |
4624
|
0 |
0 |
for (unsigned middle = initial; middle < middle_masks.size(); middle++) { |
4625
|
0 |
0 |
if (!middle_masks[middle]) continue; |
4627
|
0 |
0 |
for (unsigned i = middle + 1; i < form.len; i++) { |
4629
|
0 |
0 |
if (!found) break; |
4630
|
0 |
0 |
if (unaligned_load(found)) { |
4631
|
0 |
0 |
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
|
0 |
0 |
if (i + 1 > middle_masks.size()) middle_masks.resize(i + 1); |
4637
|
0 |
0 |
if (middle > initial && middle < form.len ) { |
|
0 |
0 |
if (middle > initial && middle < form.len ) { |
4638
|
0 |
0 |
if (initial) { |
4639
|
0 |
0 |
if (form_tmp.empty()) form_tmp.assign(form.str, form.str + form.len); |
4643
|
0 |
0 |
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
|
0 |
0 |
dictionary.analyze(string_piece((initial ? form_tmp.data() : form.str) + middle - initial, form.len - middle + initial), lemmas); |
4645
|
0 |
0 |
for (unsigned i = lemmas_ori_size; i < lemmas.size(); i++) { |
4646
|
0 |
0 |
for (unsigned filter = 0; filter < tag_filters.size(); filter++) |
4647
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
|
0 |
0 |
if ((middle_masks[middle] & (1<
|
4648
|
0 |
0 |
if (i == lemmas_new_size) { |
4651
|
0 |
0 |
lemmas[lemmas_new_size].lemma.reserve(lemmas[i].lemma.size() + middle - initial); |
4660
|
0 |
0 |
if (lemmas_new_size < lemmas.size()) lemmas.erase(lemmas.begin() + lemmas_new_size, lemmas.end()); |
4785
|
30 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
4 |
26 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
|
0 |
0 |
return chr.chr < ragel_map.size() && ragel_map[chr.chr] != 128 ? ragel_map[chr.chr] : 128 + (uint32_t(chr.cat) * uint32_t(0x077CB531U) >> 27); |
4845
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
|
0 |
0 |
czech_morpho(morpho_language language, unsigned version) : language(language), version(version) {} |
4888
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
4892
|
0 |
0 |
unsigned tag_length = data.next_1B(); |
4893
|
0 |
0 |
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
|
0 |
0 |
if (tag_length < unknown_tag.size()) unknown_tag.erase(tag_length); |
4894
|
0 |
0 |
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
|
0 |
0 |
if (tag_length < number_tag.size()) number_tag.erase(tag_length); |
4895
|
0 |
0 |
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
|
0 |
0 |
if (tag_length < punctuation_tag.size()) punctuation_tag.erase(tag_length); |
4898
|
0 |
0 |
dictionary.load(data); |
4902
|
0 |
0 |
if (data.next_1B()) { |
|
0 |
0 |
if (data.next_1B()) { |
4903
|
0 |
0 |
prefix_guesser.reset(new morpho_prefix_guesser(dictionary)); |
4904
|
0 |
0 |
prefix_guesser->load(data); |
4909
|
0 |
0 |
if (data.next_1B()) { |
|
0 |
0 |
if (data.next_1B()) { |
4910
|
0 |
0 |
statistical_guesser.reset(new morpho_statistical_guesser()); |
4911
|
0 |
0 |
statistical_guesser->load(data); |
4912
|
0 |
0 |
} |
4923
|
0 |
0 |
if (form.len) { |
4927
|
0 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
4930
|
0 |
0 |
dictionary.analyze(form, lemmas); |
4931
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
4932
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
4933
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
4936
|
0 |
0 |
analyze_special(form, lemmas); |
4937
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
4940
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
4941
|
0 |
0 |
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
|
0 |
0 |
prefix_guesser->analyze(form_lc.empty() ? form : form_lc, lemmas); |
4945
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
4946
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
4947
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, nullptr); |
4949
|
0 |
0 |
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
4950
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, &used_rules); |
4951
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
4952
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
4958
|
0 |
0 |
if (prefix_guesser_guesses) { |
4961
|
0 |
0 |
return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); |
4964
|
0 |
0 |
return a.lemma == b.lemma && a.tag == b.tag; |
|
0 |
0 |
return a.lemma == b.lemma && a.tag == b.tag; |
4966
|
0 |
0 |
if (lemmas_end != lemmas.end()) lemmas.erase(lemmas_end, lemmas.end()); |
4969
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
4972
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
4981
|
0 |
0 |
if (lemma.len) { |
4982
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
4985
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
|
0 |
0 |
if (guesser == GUESSER && prefix_guesser) |
5006
|
0 |
0 |
return new czech_tokenizer(language, version, this); |
5037
|
0 |
0 |
if (!form.len) return; |
5045
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
5046
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
5047
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
|
0 |
0 |
if ((codepoint == '.' && form.len) || codepoint == ',') codepoint = utf8::decode(form.str, form.len); |
5048
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
5049
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
5051
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(form.str, form.len); |
5053
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(form.str, form.len); |
5056
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !form.len && (!codepoint || codepoint == '.')) { |
5057
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), number_tag); |
5058
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
|
0 |
0 |
} else if ((first < sizeof(punctuation_additional) && punctuation_additional[first]) || |
5059
|
0 |
0 |
((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first]))) |
|
0 |
0 |
((unicode::category(first) & unicode::P) && (first >= sizeof(punctuation_exceptions) || !punctuation_exceptions[first]))) |
5060
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
5098
|
0 |
0 |
for (unsigned len = 1; len < lemma.len; len++) { |
5099
|
0 |
0 |
if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+')) |
|
0 |
0 |
if (len + 1 == lemma.len && (lemma.str[len] == '^' || lemma.str[len] == '+')) |
5101
|
0 |
0 |
if (len + 1 < lemma.len && lemma.str[len] == '^') { |
|
0 |
0 |
if (len + 1 < lemma.len && lemma.str[len] == '^') { |
5103
|
0 |
0 |
for (unsigned i = len + 1; ok && i < lemma.len; i++) |
|
0 |
0 |
for (unsigned i = len + 1; ok && i < lemma.len; i++) |
5104
|
0 |
0 |
ok &= (lemma.str[i] >= 'A' && lemma.str[i] <= 'Z') || |
5105
|
0 |
0 |
(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') || |
|
0 |
0 |
(lemma.str[i] >= 'a' && lemma.str[i] <= 'z') || |
5106
|
0 |
0 |
(i > len + 1 && lemma.str[i] == '-'); |
5107
|
0 |
0 |
if (ok) return len; |
5130
|
0 |
0 |
for (size_t i = len; i < lemma.len; i++) |
5137
|
0 |
0 |
if (data.empty()) return true; |
5138
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '^') return other_addinfo_len > 0 && other_addinfo[0] == '^'; |
5139
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
|
0 |
0 |
if (data.size() == 1 && data[0] == '+') return other_addinfo_len == 0; |
5140
|
0 |
0 |
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
|
0 |
0 |
return data.size() == size_t(other_addinfo_len) && small_memeq(data.data(), other_addinfo, other_addinfo_len); |
5160
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
|
0 |
0 |
class english_morpho_guesser { |
5208
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
|
0 |
0 |
english_morpho(unsigned version) : version(version) {} |
5282
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
5285
|
0 |
0 |
dictionary.load(data); |
5286
|
0 |
0 |
morpho_guesser.load(data); |
|
0 |
0 |
morpho_guesser.load(data); |
5297
|
0 |
0 |
if (form.len) { |
5301
|
0 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
5304
|
0 |
0 |
dictionary.analyze(form, lemmas); |
5305
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
5306
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
0 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
5307
|
0 |
0 |
if (!lemmas.empty()) |
5308
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
|
0 |
0 |
return guesser == NO_GUESSER || !morpho_guesser.analyze_proper_names(form, form_lc.empty() ? form : form_lc, lemmas) ? NO_GUESSER : GUESSER; |
5311
|
0 |
0 |
analyze_special(form, lemmas); |
5312
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
5315
|
0 |
0 |
if (guesser == GUESSER) |
5316
|
0 |
0 |
morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas); |
|
0 |
0 |
morpho_guesser.analyze(form, form_lc.empty() ? form : form_lc, lemmas); |
5317
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
5320
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
5329
|
0 |
0 |
if (lemma.len) { |
5330
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
5350
|
0 |
0 |
return new english_tokenizer(version <= 2 ? 1 : 2); |
5357
|
0 |
0 |
if (!form.len) return; |
5360
|
0 |
0 |
if (form.len == 1) |
5364
|
0 |
0 |
case '?': lemmas.emplace_back(string(form.str, form.len), dot_tag); return; |
5365
|
0 |
0 |
case ',': lemmas.emplace_back(string(form.str, form.len), comma_tag); return; |
5366
|
0 |
0 |
case '#': lemmas.emplace_back(string(form.str, form.len), hash_tag); return; |
5367
|
0 |
0 |
case '$': lemmas.emplace_back(string(form.str, form.len), dollar_tag); return; |
5368
|
0 |
0 |
case '[': lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
5369
|
0 |
0 |
case ']': lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
5370
|
0 |
0 |
case '%': lemmas.emplace_back(string(form.str, form.len), jj_tag); |
5371
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nn_tag); return; |
5372
|
0 |
0 |
case '&': lemmas.emplace_back(string(form.str, form.len), cc_tag); |
5373
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), sym_tag); return; |
5374
|
0 |
0 |
case '*': lemmas.emplace_back(string(form.str, form.len), sym_tag); |
5375
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nn_tag); return; |
5376
|
0 |
0 |
case '@': lemmas.emplace_back(string(form.str, form.len), sym_tag); |
5377
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), in_tag); return; |
5378
|
0 |
0 |
case '\'': lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); |
5379
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), pos_tag); return; |
5386
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
5387
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
5388
|
0 |
0 |
while (codepoint == ',') { |
5390
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
5391
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
5392
|
0 |
0 |
if (unicode::category(utf8::decode(group.str, group.len) & ~unicode::N)) break; |
5397
|
0 |
0 |
if (codepoint == '.' && number.len) { |
|
0 |
0 |
if (codepoint == '.' && number.len) { |
5399
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
5401
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
|
0 |
0 |
if (version >= 2 && any_digit && codepoint == 's' && !number.len) { |
5402
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
5403
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len - 1), nns_tag); |
5406
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
5408
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
5410
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
5412
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
5413
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
5414
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), nnp_tag); |
5415
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
|
0 |
0 |
if (form.len == 1 + (codepoint == '.') && *form.str >= '1' && *form.str <= '9') |
5416
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), ls_tag); |
5423
|
0 |
0 |
while ((symbol || any_punctuation) && punctuation.len) { |
|
0 |
0 |
while ((symbol || any_punctuation) && punctuation.len) { |
5425
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
|
0 |
0 |
if (open_quotation) open_quotation = codepoint == '`' || unicode::category(codepoint) & unicode::Pi; |
5426
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
|
0 |
0 |
if (close_quotation) close_quotation = codepoint == '\'' || codepoint == '"' || unicode::category(codepoint) & unicode::Pf; |
5427
|
0 |
0 |
if (open_parenthesis) open_parenthesis = unicode::category(codepoint) & unicode::Ps; |
5428
|
0 |
0 |
if (close_parenthesis) close_parenthesis = unicode::category(codepoint) & unicode::Pe; |
5429
|
0 |
0 |
if (any_punctuation) any_punctuation = unicode::category(codepoint) & unicode::P; |
5430
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
|
0 |
0 |
if (symbol) symbol = codepoint == '*' || unicode::category(codepoint) & unicode::S; |
5432
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_quotation) { lemmas.emplace_back(string(form.str, form.len), open_quotation_tag); return; } |
5433
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_quotation) { lemmas.emplace_back(string(form.str, form.len), close_quotation_tag); return; } |
5434
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && open_parenthesis) { lemmas.emplace_back(string(form.str, form.len), open_parenthesis_tag); return; } |
5435
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
|
0 |
0 |
if (!punctuation.len && close_parenthesis) { lemmas.emplace_back(string(form.str, form.len), close_parenthesis_tag); return; } |
5436
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
|
0 |
0 |
if (!punctuation.len && symbol) { lemmas.emplace_back(string(form.str, form.len), sym_tag); return; } |
5437
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
|
0 |
0 |
if (!punctuation.len && any_punctuation) { lemmas.emplace_back(string(form.str, form.len), punctuation_tag); return; } |
5471
|
0 |
0 |
while (tags--) { |
5473
|
0 |
0 |
exceptions_tags.emplace_back(string(data.next(len), len)); |
5609
|
0 |
0 |
for (unsigned len = data.next_1B(); len; len--) { |
5615
|
0 |
0 |
if (exception) { |
5618
|
0 |
0 |
for (unsigned len = data.next_1B(); len; len--) { |
5621
|
0 |
0 |
for (unsigned tags = data.next_1B(); tags; tags--) |
5622
|
0 |
0 |
lemmas.emplace_back(lemma, exceptions_tags[data.next_2B()]); |
5629
|
0 |
0 |
for (unsigned prefix = 1; prefix <= form_lc.len; prefix++) { |
5631
|
0 |
0 |
if (!found) break; |
5632
|
0 |
0 |
if (found[NEGATION_LEN]) { |
5633
|
0 |
0 |
if (form_lc.len - prefix >= found[TO_FOLLOW]) negation_len = found[NEGATION_LEN]; |
5639
|
0 |
0 |
add(JJ, lemma_lc, negation_len, lemmas); |
5640
|
0 |
0 |
add(RB, lemma_lc, negation_len, lemmas); |
5641
|
0 |
0 |
add(NN, lemma_lc, negation_len, lemmas); |
5642
|
0 |
0 |
add_NNS(lemma_lc, negation_len, lemmas); |
5659
|
0 |
0 |
if ( p == ( (form_lc.str + form_lc.len)) ) |
5666
|
0 |
0 |
if ( _klen > 0 ) { |
5671
|
0 |
0 |
if ( _upper < _lower ) |
5675
|
0 |
0 |
if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < *_mid ) |
5677
|
0 |
0 |
else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > *_mid ) |
5689
|
0 |
0 |
if ( _klen > 0 ) { |
5694
|
0 |
0 |
if ( _upper < _lower ) |
5698
|
0 |
0 |
if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) < _mid[0] ) |
5700
|
0 |
0 |
else if ( ( form_lc.str[form_lc.len - 1 - (p - form_lc.str)]) > _mid[1] ) |
5714
|
0 |
0 |
if ( _tag_guesser_trans_actions[_trans] == 0 ) |
5719
|
0 |
0 |
while ( _nacts-- > 0 ) |
5724
|
0 |
0 |
{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); } |
|
0 |
0 |
{ if (!added_JJR_RBR) added_JJR_RBR = true, add_JJR_RBR(lemma_lc, negation_len, lemmas); } |
5727
|
0 |
0 |
{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); } |
|
0 |
0 |
{ if (!added_JJS_RBS) added_JJS_RBS = true, add_JJS_RBS(lemma_lc, negation_len, lemmas); } |
5730
|
0 |
0 |
{ add_VBG(lemma_lc, lemmas); } |
5733
|
0 |
0 |
{ add_VBD_VBN(lemma_lc, lemmas); } |
5736
|
0 |
0 |
{ add_VBZ(lemma_lc, lemmas); } |
5742
|
0 |
0 |
{ if (!added_SYM) added_SYM = true, add(SYM, lemma_lc, lemmas); } |
5745
|
0 |
0 |
{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); } |
5751
|
0 |
0 |
if ( ++p != ( (form_lc.str + form_lc.len)) ) |
5754
|
0 |
0 |
if ( p == ( (form_lc.str + form_lc.len)) ) |
5758
|
0 |
0 |
while ( __nacts-- > 0 ) { |
5759
|
0 |
0 |
switch ( *__acts++ ) { |
5761
|
0 |
0 |
{ if (!added_CD) added_CD = true, add(CD, lemma_lc, lemmas); } |
5777
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
|
0 |
0 |
bool is_NNP = form.str != form_lc.str || (form.len && (*form.str == '\'' || (*form.str >= '0' && *form.str <= '9'))); |
5779
|
0 |
0 |
if (!is_NNP && !is_NNPS) return false; |
5782
|
0 |
0 |
for (auto&& lemma : lemmas) { |
5786
|
0 |
0 |
if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false; |
|
0 |
0 |
if (!((is_NNP && !was_NNP) || (is_NNPS && !was_NNPS))) return false; |
5789
|
0 |
0 |
if (is_NNP && !was_NNP) add(NNP, lemma, lemmas); |
5790
|
0 |
0 |
if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas); |
|
0 |
0 |
if (is_NNPS && !was_NNPS) add_NNPS(lemma, lemmas); |
5795
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
|
0 |
0 |
lemmas.emplace_back(form, tag); |
5804
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
|
0 |
0 |
lemmas.emplace_back(negation_len ? form.substr(negation_len) + "^" + form.substr(0, negation_len) : form, tag); |
5906
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
5915
|
0 |
0 |
if ( _klen > 0 ) { |
5920
|
0 |
0 |
if ( _upper < _lower ) |
5924
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
5926
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
5938
|
0 |
0 |
if ( _klen > 0 ) { |
5943
|
0 |
0 |
if ( _upper < _lower ) |
5947
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
5949
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
5963
|
0 |
0 |
if ( _NNS_trans_actions[_trans] == 0 ) |
5968
|
0 |
0 |
while ( _nacts-- > 0 ) |
5973
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = "an"; } |
5976
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 1, append = nullptr; } |
5979
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = "fe"; } |
5982
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
5985
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
5988
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
5991
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
5994
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
5997
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
6000
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
6003
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
6006
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 3, append = "y"; } |
6009
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
6012
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
6018
|
0 |
0 |
if ( cs == 0 ) |
6020
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
6026
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(NNS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
6152
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
6161
|
0 |
0 |
if ( _klen > 0 ) { |
6166
|
0 |
0 |
if ( _upper < _lower ) |
6170
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
6172
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
6184
|
0 |
0 |
if ( _klen > 0 ) { |
6189
|
0 |
0 |
if ( _upper < _lower ) |
6193
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
6195
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
6209
|
0 |
0 |
if ( _NNPS_trans_actions[_trans] == 0 ) |
6214
|
0 |
0 |
while ( _nacts-- > 0 ) |
6219
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = "AN"; } |
6222
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = "an"; } |
6225
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
6228
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = "FE"; } |
6231
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 3, append = "fe"; } |
6234
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
6237
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
6240
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
6243
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
6246
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 2, append = nullptr; } |
6249
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 1, append = nullptr; } |
6252
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
6255
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
6258
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 3, append = "Y"; } |
6261
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 3, append = "y"; } |
6264
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 2, append = nullptr; } |
6267
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 1, append = nullptr; } |
6273
|
0 |
0 |
if ( cs == 0 ) |
6275
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
6281
|
0 |
0 |
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(NNPS, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
6581
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
6590
|
0 |
0 |
if ( _klen > 0 ) { |
6595
|
0 |
0 |
if ( _upper < _lower ) |
6599
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
6601
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
6613
|
0 |
0 |
if ( _klen > 0 ) { |
6618
|
0 |
0 |
if ( _upper < _lower ) |
6622
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
6624
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
6638
|
0 |
0 |
if ( _VBG_trans_actions[_trans] == 0 ) |
6643
|
0 |
0 |
while ( _nacts-- > 0 ) |
6648
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
6651
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 3, append = "e"; } |
6654
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
6657
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = "e"; } |
6660
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 3, append = nullptr; } |
6663
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
6666
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 3, append = nullptr; } |
6669
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 3, append = "e"; } |
6672
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 3, append = nullptr; } |
6675
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 3, append = "e"; } |
6678
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 3, append = nullptr; } |
6681
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 3, append = "e"; } |
6684
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 3, append = nullptr; } |
6687
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 3, append = "e"; } |
6690
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 3, append = nullptr; } |
6693
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
6696
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 3, append = nullptr; } |
6699
|
0 |
0 |
{ if (best > 'r') best = 'r', remove = 3, append = "e"; } |
6705
|
0 |
0 |
if ( cs == 0 ) |
6707
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
6710
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
6714
|
0 |
0 |
while ( __nacts-- > 0 ) { |
6717
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = nullptr; } |
6720
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = "e"; } |
6723
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 3, append = "e"; } |
6732
|
0 |
0 |
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(VBG, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
7035
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7044
|
0 |
0 |
if ( _klen > 0 ) { |
7049
|
0 |
0 |
if ( _upper < _lower ) |
7053
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
7055
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
7067
|
0 |
0 |
if ( _klen > 0 ) { |
7072
|
0 |
0 |
if ( _upper < _lower ) |
7076
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
7078
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
7092
|
0 |
0 |
if ( _VBD_VBN_trans_actions[_trans] == 0 ) |
7097
|
0 |
0 |
while ( _nacts-- > 0 ) |
7102
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
7105
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
7108
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
7111
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7114
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7117
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7120
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
7123
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 3, append = "y"; } |
7126
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
7129
|
0 |
0 |
{ if (best > 'k') best = 'k', remove = 2, append = nullptr; } |
7132
|
0 |
0 |
{ if (best > 'l') best = 'l', remove = 1, append = nullptr; } |
7135
|
0 |
0 |
{ if (best > 'm') best = 'm', remove = 2, append = nullptr; } |
7138
|
0 |
0 |
{ if (best > 'n') best = 'n', remove = 1, append = nullptr; } |
7141
|
0 |
0 |
{ if (best > 'o') best = 'o', remove = 2, append = nullptr; } |
7144
|
0 |
0 |
{ if (best > 'p') best = 'p', remove = 1, append = nullptr; } |
7147
|
0 |
0 |
{ if (best > 'q') best = 'q', remove = 2, append = nullptr; } |
7150
|
0 |
0 |
{ if (best > 'r') best = 'r', remove = 1, append = nullptr; } |
7156
|
0 |
0 |
if ( cs == 0 ) |
7158
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7161
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7165
|
0 |
0 |
while ( __nacts-- > 0 ) { |
7168
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7171
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 1, append = nullptr; } |
7174
|
0 |
0 |
{ if (best > 'j') best = 'j', remove = 1, append = nullptr; } |
7183
|
0 |
0 |
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(VBD, VBN, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
7262
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7271
|
0 |
0 |
if ( _klen > 0 ) { |
7276
|
0 |
0 |
if ( _upper < _lower ) |
7280
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < *_mid ) |
7282
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > *_mid ) |
7294
|
0 |
0 |
if ( _klen > 0 ) { |
7299
|
0 |
0 |
if ( _upper < _lower ) |
7303
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str())]) < _mid[0] ) |
7305
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str())]) > _mid[1] ) |
7319
|
0 |
0 |
if ( _VBZ_trans_actions[_trans] == 0 ) |
7324
|
0 |
0 |
while ( _nacts-- > 0 ) |
7329
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 1, append = nullptr; } |
7332
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 2, append = nullptr; } |
7335
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 1, append = nullptr; } |
7338
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7341
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7344
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7347
|
0 |
0 |
{ if (best > 'g') best = 'g', remove = 3, append = "y"; } |
7350
|
0 |
0 |
{ if (best > 'h') best = 'h', remove = 2, append = nullptr; } |
7353
|
0 |
0 |
{ if (best > 'i') best = 'i', remove = 1, append = nullptr; } |
7359
|
0 |
0 |
if ( cs == 0 ) |
7361
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7367
|
0 |
0 |
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
|
0 |
0 |
add(VBZ, form.substr(0, form.size() - remove).append(append ? append : ""), lemmas); |
7493
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7502
|
0 |
0 |
if ( _klen > 0 ) { |
7507
|
0 |
0 |
if ( _upper < _lower ) |
7511
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
7513
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
7525
|
0 |
0 |
if ( _klen > 0 ) { |
7530
|
0 |
0 |
if ( _upper < _lower ) |
7534
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
7536
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
7550
|
0 |
0 |
if ( _JJR_RBR_trans_actions[_trans] == 0 ) |
7555
|
0 |
0 |
while ( _nacts-- > 0 ) |
7560
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 2, append = nullptr; } |
7563
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 3, append = nullptr; } |
7566
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 3, append = "y"; } |
7569
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 2, append = nullptr; } |
7572
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 1, append = nullptr; } |
7575
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 2, append = nullptr; } |
7581
|
0 |
0 |
if ( cs == 0 ) |
7583
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7589
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJR, RBR, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
7719
|
0 |
0 |
if ( p == ( (form.c_str() + form.size())) ) |
7728
|
0 |
0 |
if ( _klen > 0 ) { |
7733
|
0 |
0 |
if ( _upper < _lower ) |
7737
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < *_mid ) |
7739
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > *_mid ) |
7751
|
0 |
0 |
if ( _klen > 0 ) { |
7756
|
0 |
0 |
if ( _upper < _lower ) |
7760
|
0 |
0 |
if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) < _mid[0] ) |
7762
|
0 |
0 |
else if ( ( form[form.size() - 1 - (p - form.c_str() - negation_len)]) > _mid[1] ) |
7776
|
0 |
0 |
if ( _JJS_RBS_trans_actions[_trans] == 0 ) |
7781
|
0 |
0 |
while ( _nacts-- > 0 ) |
7786
|
0 |
0 |
{ if (best > 'a') best = 'a', remove = 3, append = nullptr; } |
7789
|
0 |
0 |
{ if (best > 'b') best = 'b', remove = 4, append = nullptr; } |
7792
|
0 |
0 |
{ if (best > 'c') best = 'c', remove = 4, append = "y"; } |
7795
|
0 |
0 |
{ if (best > 'd') best = 'd', remove = 3, append = nullptr; } |
7798
|
0 |
0 |
{ if (best > 'e') best = 'e', remove = 2, append = nullptr; } |
7801
|
0 |
0 |
{ if (best > 'f') best = 'f', remove = 3, append = nullptr; } |
7807
|
0 |
0 |
if ( cs == 0 ) |
7809
|
0 |
0 |
if ( ++p != ( (form.c_str() + form.size())) ) |
7815
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
|
0 |
0 |
add(JJS, RBS, form.substr(0, form.size() - remove).append(append ? append : ""), negation_len, lemmas); |
7898
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
7902
|
0 |
0 |
unsigned length = data.next_1B(); |
7903
|
0 |
0 |
unknown_tag.assign(data.next(length), length); |
|
0 |
0 |
unknown_tag.assign(data.next(length), length); |
7914
|
0 |
0 |
if (form.len) { |
7917
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
7918
|
0 |
0 |
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
7921
|
0 |
0 |
while (lemmatags.len) { |
7923
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
7924
|
0 |
0 |
if (!lemmatags.len) break; |
7929
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
|
0 |
0 |
while (lemmatags.len && *lemmatags.str != ' ') lemmatags.len--, lemmatags.str++; |
7931
|
0 |
0 |
if (lemmatags.len) lemmatags.len--, lemmatags.str++; |
7933
|
0 |
0 |
lemmas.emplace_back(string(lemma_start, lemma_len), string(tag_start, tag_len)); |
7936
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
7939
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
7948
|
0 |
0 |
if (lemma.len) { |
7951
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
7953
|
0 |
0 |
if (formtags.len) formtags.len--, formtags.str++; |
7957
|
0 |
0 |
while (formtags.len) { |
7959
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
7960
|
0 |
0 |
if (!formtags.len) break; |
7965
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
|
0 |
0 |
while (formtags.len && *formtags.str != ' ') formtags.len--, formtags.str++; |
7967
|
0 |
0 |
if (formtags.len) formtags.len--, formtags.str++; |
7971
|
0 |
0 |
if (filter.matches(tag.c_str())) { |
7972
|
0 |
0 |
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
|
0 |
0 |
if (forms.empty()) forms.emplace_back(string(real_lemma.str, real_lemma.len)); |
7973
|
0 |
0 |
forms.back().forms.emplace_back(string(form_start, form_len), tag); |
7977
|
0 |
0 |
if (any_result) return NO_GUESSER; |
7985
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
7991
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
|
0 |
0 |
while (lemma_len < lemma.len && lemma.str[lemma_len] != ' ') lemma_len++; |
7997
|
0 |
0 |
while (form_len < form.len && form.str[form_len] != ' ') form_len++; |
|
0 |
0 |
while (form_len < form.len && form.str[form_len] != ' ') form_len++; |
8118
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
8122
|
1 |
0 |
unsigned length = data.next_1B(); |
8123
|
1 |
0 |
unknown_tag.assign(data.next(length), length); |
8124
|
1 |
0 |
length = data.next_1B(); |
8125
|
1 |
0 |
number_tag.assign(data.next(length), length); |
8126
|
1 |
0 |
length = data.next_1B(); |
8127
|
1 |
0 |
punctuation_tag.assign(data.next(length), length); |
8128
|
1 |
0 |
length = data.next_1B(); |
8129
|
1 |
0 |
symbol_tag.assign(data.next(length), length); |
8132
|
1 |
0 |
dictionary.load(data); |
8136
|
1 |
0 |
if (data.next_1B()) { |
|
1 |
0 |
if (data.next_1B()) { |
8137
|
1 |
0 |
statistical_guesser.reset(new morpho_statistical_guesser()); |
8138
|
1 |
0 |
statistical_guesser->load(data); |
8139
|
0 |
0 |
} |
8150
|
7 |
0 |
if (form.len) { |
8154
|
7 |
0 |
generate_casing_variants(form, form_uclc, form_lc); |
8157
|
7 |
0 |
dictionary.analyze(form, lemmas); |
8158
|
0 |
7 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
|
0 |
0 |
if (!form_uclc.empty()) dictionary.analyze(form_uclc, lemmas); |
8159
|
1 |
6 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
|
1 |
0 |
if (!form_lc.empty()) dictionary.analyze(form_lc, lemmas); |
8160
|
0 |
7 |
if (!lemmas.empty()) return NO_GUESSER; |
8163
|
0 |
0 |
analyze_special(form, lemmas); |
8164
|
0 |
0 |
if (!lemmas.empty()) return NO_GUESSER; |
8167
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
|
0 |
0 |
if (guesser == GUESSER && statistical_guesser) { |
8168
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
|
0 |
0 |
if (form_uclc.empty() && form_lc.empty()) |
8169
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, nullptr); |
8171
|
0 |
0 |
morpho_statistical_guesser::used_rules used_rules; used_rules.reserve(3); |
8172
|
0 |
0 |
statistical_guesser->analyze(form, lemmas, &used_rules); |
8173
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_uclc.empty()) statistical_guesser->analyze(form_uclc, lemmas, &used_rules); |
8174
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
|
0 |
0 |
if (!form_lc.empty()) statistical_guesser->analyze(form_lc, lemmas, &used_rules); |
8177
|
0 |
0 |
if (!lemmas.empty()) return GUESSER; |
8180
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), unknown_tag); |
8189
|
0 |
0 |
if (lemma.len) { |
8190
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
|
0 |
0 |
if (dictionary.generate(lemma, filter, forms)) |
8220
|
0 |
0 |
if (!form.len) return; |
8228
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
8229
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8230
|
0 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
0 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
|
0 |
0 |
if ((codepoint == '.' && number.len) || codepoint == ',') codepoint = utf8::decode(number.str, number.len); |
8231
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8232
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
|
0 |
0 |
if (any_digit && (codepoint == 'e' || codepoint == 'E')) { |
8234
|
0 |
0 |
if (codepoint == '+' || codepoint == '-') codepoint = utf8::decode(number.str, number.len); |
8236
|
0 |
0 |
while (unicode::category(codepoint) & unicode::N) any_digit = true, codepoint = utf8::decode(number.str, number.len); |
8239
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
|
0 |
0 |
if (any_digit && !number.len && (!codepoint || codepoint == '.')) { |
8240
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), number_tag); |
8247
|
0 |
0 |
while (form.len) { |
8249
|
0 |
0 |
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
|
0 |
0 |
punctuation = punctuation && unicode::category(codepoint) & unicode::P; |
8250
|
0 |
0 |
symbol = symbol && unicode::category(codepoint) & unicode::S; |
|
0 |
0 |
symbol = symbol && unicode::category(codepoint) & unicode::S; |
8252
|
0 |
0 |
if (punctuation) |
8253
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), punctuation_tag); |
8254
|
0 |
0 |
else if (symbol) |
8255
|
0 |
0 |
lemmas.emplace_back(string(form_ori.str, form_ori.len), symbol_tag); |
8302
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
|
0 |
0 |
construct(std::map(map.begin(), map.end()), load_factor, entry_encode); |
8308
|
0 |
0 |
std::map enlarged_map(map.begin(), map.end()); |
|
0 |
0 |
std::map enlarged_map(map.begin(), map.end()); |
8310
|
0 |
0 |
for (auto&& entry : map) { |
|
0 |
0 |
for (auto&& entry : map) { |
8313
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
0 |
0 |
if (!key.empty() && add_prefixes) |
|
0 |
0 |
if (!key.empty() && add_prefixes) |
8314
|
0 |
0 |
for (unsigned i = key.size() - 1; i; i--) |
|
0 |
0 |
for (unsigned i = key.size() - 1; i; i--) |
8315
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
|
0 |
0 |
enlarged_map[key.substr(0, i)]; |
8317
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
0 |
0 |
if (!key.empty() && add_suffixes) |
|
0 |
0 |
if (!key.empty() && add_suffixes) |
8318
|
0 |
0 |
for (unsigned i = 1; i < key.size(); i++) |
|
0 |
0 |
for (unsigned i = 1; i < key.size(); i++) |
8319
|
0 |
0 |
enlarged_map[key.substr(i)]; |
|
0 |
0 |
enlarged_map[key.substr(i)]; |
|
0 |
0 |
enlarged_map[key.substr(i)]; |
|
0 |
0 |
enlarged_map[key.substr(i)]; |
8322
|
0 |
0 |
construct(enlarged_map, load_factor, entry_encode); |
|
0 |
0 |
construct(enlarged_map, load_factor, entry_encode); |
8333
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
8335
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
|
0 |
0 |
if (len >= sizes.size()) sizes.resize(len + 1); |
8338
|
0 |
0 |
for (auto&& size : sizes) |
|
0 |
0 |
for (auto&& size : sizes) |
|
0 |
0 |
for (auto&& size : sizes) |
|
0 |
0 |
for (auto&& size : sizes) |
8339
|
0 |
0 |
resize(unsigned(load_factor * size)); |
|
0 |
0 |
resize(unsigned(load_factor * size)); |
|
0 |
0 |
resize(unsigned(load_factor * size)); |
|
0 |
0 |
resize(unsigned(load_factor * size)); |
8342
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
8343
|
0 |
0 |
binary_encoder enc; |
|
0 |
0 |
binary_encoder enc; |
|
0 |
0 |
binary_encoder enc; |
|
0 |
0 |
binary_encoder enc; |
8344
|
0 |
0 |
entry_encode(enc, elem.second); |
|
0 |
0 |
entry_encode(enc, elem.second); |
|
0 |
0 |
entry_encode(enc, elem.second); |
|
0 |
0 |
entry_encode(enc, elem.second); |
8347
|
0 |
0 |
done_adding(); |
|
0 |
0 |
done_adding(); |
|
0 |
0 |
done_adding(); |
|
0 |
0 |
done_adding(); |
8350
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
|
0 |
0 |
for (auto&& elem : map) { |
8351
|
0 |
0 |
binary_encoder enc; |
|
0 |
0 |
binary_encoder enc; |
|
0 |
0 |
binary_encoder enc; |
|
0 |
0 |
binary_encoder enc; |
8352
|
0 |
0 |
entry_encode(enc, elem.second); |
|
0 |
0 |
entry_encode(enc, elem.second); |
|
0 |
0 |
entry_encode(enc, elem.second); |
|
0 |
0 |
entry_encode(enc, elem.second); |
8361
|
0 |
0 |
for (auto&& hash : hashes) |
8420
|
1 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
|
0 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
|
0 |
0 |
return unique_ptr(new T(std::forward(args)...)); |
8460
|
0 |
0 |
if (!*str) return; |
8462
|
0 |
0 |
for (auto&& child : children) |
8463
|
0 |
0 |
if (child.first == *str) { |
8468
|
0 |
0 |
children.emplace_back(*str, new_unique_ptr()); |
8476
|
0 |
0 |
find_candidate_prefix(max_suffix_len, current, best, best_length, 0); |
8480
|
0 |
0 |
if (depth < max_suffix_len && length > best_length) { |
|
0 |
0 |
if (depth < max_suffix_len && length > best_length) { |
8484
|
0 |
0 |
for (auto&& child : children) { |
8486
|
0 |
0 |
child.second->find_candidate_prefix(max_suffix_len, current, best, best_length, children.size() == 1 ? length + 1 : 1); |
8498
|
0 |
0 |
if (str.size() >= lengths.size()) lengths.resize(str.size() + 1); |
8504
|
0 |
0 |
for (auto&& set : lengths) |
8513
|
0 |
0 |
this->lemma = lemma.substr(0, addinfo.parse(lemma, true)); |
8524
|
0 |
0 |
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
|
0 |
0 |
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
|
0 |
0 |
bool operator<(const lemma_form_info& other) const { return form < other.form || (form == other.form && clas < other.clas); } |
8528
|
0 |
0 |
bool operator<(const lemma_info& other) const { return lemma < other.lemma || (lemma == other.lemma && addinfo.data < other.addinfo.data); } |
8547
|
0 |
0 |
dict.load(is, max_suffix_len); |
8550
|
0 |
0 |
dict.encode(enc); |
8559
|
0 |
0 |
while(raw.next_lemma(lemma, forms)) { |
|
0 |
0 |
while(raw.next_lemma(lemma, forms)) { |
8563
|
0 |
0 |
if (forms_end != forms.end()) { |
8569
|
0 |
0 |
lemmas.emplace_back(lemma); |
8571
|
0 |
0 |
lemmas_hist.add(lemma_info.lemma); |
8574
|
0 |
0 |
while (!forms.empty()) { |
8576
|
0 |
0 |
for (auto&& form : forms) |
8577
|
0 |
0 |
t.add(form.first.c_str()); |
8580
|
0 |
0 |
string prefix = t.find_candidate_prefix(max_suffix_len); |
8584
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
|
0 |
0 |
while (start != forms.end() && start->first.compare(0, prefix.size(), prefix) != 0) start++; |
8585
|
0 |
0 |
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
|
0 |
0 |
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
|
0 |
0 |
if (start == forms.end()) training_failure("Internal error when generating classes, cannot find prefix '" << prefix << "'!"); |
8587
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
|
0 |
0 |
while (end != forms.end() && end->first.compare(0, prefix.size(), prefix) == 0) end++; |
8591
|
0 |
0 |
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
|
0 |
0 |
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
|
0 |
0 |
while (common_prefix < int(start->first.size()) && start->first[common_prefix] == (end-1)->first[common_prefix]) common_prefix++; |
8594
|
0 |
0 |
for (auto form = start; form != end; form++) { |
8595
|
0 |
0 |
if (!clas.empty()) clas.push_back('\t'); |
|
0 |
0 |
if (!clas.empty()) clas.push_back('\t'); |
8596
|
0 |
0 |
clas.append(form->first, common_prefix, string::npos); |
8597
|
0 |
0 |
clas.push_back('\t'); |
8603
|
0 |
0 |
if (class_it.second) { |
8605
|
0 |
0 |
for (auto form = start; form != end; form++) { |
8607
|
0 |
0 |
if (tag >= int(tags.size())) tags.emplace_back(form->second); |
|
0 |
0 |
if (tag >= int(tags.size())) tags.emplace_back(form->second); |
8608
|
0 |
0 |
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
|
0 |
0 |
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
|
0 |
0 |
suffixes[form->first.substr(common_prefix)][class_id].emplace_back(tag); |
8613
|
0 |
0 |
lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id); |
|
0 |
0 |
lemma_info.forms.emplace_back(start->first.substr(0, common_prefix), class_id); |
8614
|
0 |
0 |
forms_hist.add(lemma_info.forms.back().form); |
8630
|
0 |
0 |
for (auto&& lemma : lemmas) { |
8632
|
0 |
0 |
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
|
0 |
0 |
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
|
0 |
0 |
while (prev[cpl] && prev[cpl] == lemma.lemma[cpl]) cpl++; |
8634
|
0 |
0 |
enc.add_1B(prev.length() - cpl); |
8635
|
0 |
0 |
enc.add_1B(lemma.lemma.size() - cpl); |
8636
|
0 |
0 |
enc.add_data(lemma.lemma.substr(cpl)); |
8637
|
0 |
0 |
enc.add_1B(lemma.addinfo.data.size()); |
8639
|
0 |
0 |
enc.add_1B(lemma.forms.size()); |
8642
|
0 |
0 |
for (auto&& lemma_form : lemma.forms) { |
8644
|
0 |
0 |
for (unsigned prev_from = 0; prev_from < prev_form.size(); prev_from++) |
8645
|
0 |
0 |
for (unsigned form_from = 0; form_from < lemma_form.form.size(); form_from++) { |
8647
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
|
0 |
0 |
while (prev_from + len < prev_form.size() && form_from + len < lemma_form.form.size() && prev_form[prev_from+len] == lemma_form.form[form_from+len]) len++; |
8648
|
0 |
0 |
if (len > best_len) best_prev_from = prev_from, best_form_from = form_from, best_len = len; |
8652
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
|
0 |
0 |
enc.add_1B(REMOVE_START * (best_prev_from>0) + REMOVE_END * (best_prev_from+best_len
|
8654
|
0 |
0 |
if (best_prev_from > 0) enc.add_1B(best_prev_from); |
|
0 |
0 |
if (best_prev_from > 0) enc.add_1B(best_prev_from); |
8655
|
0 |
0 |
if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len); |
|
0 |
0 |
if (best_prev_from + best_len < prev_form.size()) enc.add_1B(prev_form.size() - best_prev_from - best_len); |
8656
|
0 |
0 |
if (best_form_from > 0) { |
8657
|
0 |
0 |
enc.add_1B(best_form_from); |
8658
|
0 |
0 |
enc.add_data(lemma_form.form.substr(0, best_form_from)); |
8660
|
0 |
0 |
if (best_form_from + best_len < lemma_form.form.size()) { |
8661
|
0 |
0 |
enc.add_1B(lemma_form.form.size() - best_form_from - best_len); |
8662
|
0 |
0 |
enc.add_data(lemma_form.form.substr(best_form_from + best_len)); |
8664
|
0 |
0 |
enc.add_2B(lemma_form.clas); |
8673
|
0 |
0 |
enc.add_2B(tags.size()); |
8674
|
0 |
0 |
for (auto&& tag : tags) { |
8675
|
0 |
0 |
enc.add_1B(tag.size()); |
8680
|
0 |
0 |
persistent_unordered_map(suffixes, 5, false, true, [](binary_encoder& enc, const map>& suffix) { |
8682
|
0 |
0 |
for (auto&& clas : suffix) |
8685
|
0 |
0 |
for (auto&& clas : suffix) { |
8686
|
0 |
0 |
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
8690
|
0 |
0 |
enc.add_2B(tags - prev_tags < (1<<16) ? uint16_t(tags) : tags); |
8691
|
0 |
0 |
for (auto&& clas : suffix) |
8692
|
0 |
0 |
for (auto&& tag : clas.second) |
8694
|
0 |
0 |
}).save(enc); |
8761
|
0 |
0 |
enc.add_1B(tags.unknown_tag.size()); |
8763
|
0 |
0 |
enc.add_1B(tags.number_tag.size()); |
8765
|
0 |
0 |
enc.add_1B(tags.punctuation_tag.size()); |
8767
|
0 |
0 |
enc.add_1B(tags.symbol_tag.size()); |
8771
|
0 |
0 |
morpho_dictionary_encoder::encode(in_dictionary, max_suffix_len, enc); |
8774
|
0 |
0 |
enc.add_1B(bool(in_statistical_guesser)); |
8775
|
0 |
0 |
if (in_statistical_guesser) { |
8777
|
0 |
0 |
morpho_statistical_guesser_encoder::encode(in_statistical_guesser, enc); |
8782
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
|
0 |
0 |
if (!compressor::save(out_morpho, enc)) training_failure("Cannot compress and write dictionary to file!"); |
8851
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8860
|
0 |
0 |
3); |
|
0 |
0 |
3); |
8861
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8867
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8873
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
1 |
0 |
if (res->load(is)) return res.release(); |
8879
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
8885
|
0 |
0 |
if (!derinet->load(is)) return nullptr; |
|
0 |
0 |
if (!derinet->load(is)) return nullptr; |
8887
|
0 |
0 |
unique_ptr dictionary(load(is)); |
8888
|
0 |
0 |
if (!dictionary) return nullptr; |
8899
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
8900
|
0 |
0 |
if (!f) return nullptr; |
8902
|
0 |
0 |
return load(f); |
8929
|
6 |
1 |
for (auto&& tag : tags) { |
8931
|
397 |
6 |
for (unsigned i = 0; i < tag.size(); i++) |
8942
|
0 |
0 |
if (!used) return false; |
8944
|
0 |
0 |
for (auto&& used_rule : *used) |
8945
|
0 |
0 |
if (used_rule == rule) |
8957
|
0 |
0 |
string rule_label; rule_label.reserve(12); |
8959
|
0 |
0 |
for (; suffix_len < form.len; suffix_len++) { |
8960
|
0 |
0 |
rule_label.push_back(form.str[form.len - (suffix_len + 1)]); |
8961
|
0 |
0 |
if (!rules.at(rule_label.c_str(), rule_label.size(), [](pointer_decoder& data){ data.next(data.next_2B()); })) |
8965
|
0 |
0 |
for (suffix_len++; suffix_len--; ) { |
8967
|
0 |
0 |
rule_label.push_back(' '); |
8971
|
0 |
0 |
for (unsigned prefix_len = 0; prefix_len + suffix_len <= form.len; prefix_len++) { |
8972
|
0 |
0 |
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
|
0 |
0 |
if (prefix_len) rule_label.push_back(form.str[prefix_len - 1]); |
8974
|
0 |
0 |
if (!found) break; |
8975
|
0 |
0 |
if (*(found += sizeof(uint16_t))) { |
8981
|
0 |
0 |
if (rule) { |
8983
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
|
0 |
0 |
if (rule_label.size() > 1 && !contains(used, rule_label)) { // ignore rule ' ' |
8984
|
0 |
0 |
if (used) used->push_back(rule_label); |
|
0 |
0 |
if (used) used->push_back(rule_label); |
8985
|
0 |
0 |
for (int rules_len = *rule++; rules_len; rules_len--) { |
8992
|
0 |
0 |
if (pref_del_len + suff_del_len > form.len || |
|
0 |
0 |
if (pref_del_len + suff_del_len > form.len || |
8993
|
0 |
0 |
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
|
0 |
0 |
(pref_del_len && !small_memeq(pref_del, form.str, pref_del_len)) || |
8994
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
|
0 |
0 |
(suff_del_len && !small_memeq(suff_del, form.str + form.len - suff_del_len, suff_del_len)) || |
8999
|
0 |
0 |
lemma.reserve(form.len + pref_add_len - pref_del_len + suff_add_len - suff_del_len); |
9000
|
0 |
0 |
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
|
0 |
0 |
if (pref_add_len) lemma.append(pref_add, pref_add_len); |
9001
|
0 |
0 |
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
|
0 |
0 |
if (pref_del_len + suff_del_len < form.len) lemma.append(form.str + pref_del_len, form.len - pref_del_len - suff_del_len); |
9002
|
0 |
0 |
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
|
0 |
0 |
if (suff_add_len) lemma.append(suff_add, suff_add_len); |
9003
|
0 |
0 |
while (tags_len--) |
9004
|
0 |
0 |
lemmas.emplace_back(lemma, this->tags[unaligned_load_inc(tags)]); |
9012
|
0 |
0 |
if (lemmas.size() == lemmas_initial_size) |
9013
|
0 |
0 |
if (!contains(used, string())) { |
9014
|
0 |
0 |
if (used) used->push_back(string()); |
9015
|
0 |
0 |
lemmas.emplace_back(string(form.str, form.len), tags[default_tag]); |
9050
|
0 |
0 |
if (text.empty()) return; |
9053
|
0 |
0 |
for (string::size_type next; (next = text.find(sep, index)) != string::npos; index = next + 1) |
9061
|
53 |
0 |
if (!text.len) return; |
9064
|
68 |
53 |
for (const char* next; (next = (const char*) memchr(str, sep, text.str + text.len - str)); str = next + 1) |
9095
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
|
0 |
0 |
if (!getline(is, line)) training_failure("Missing first line with default tag in statistical guesser file"); |
9097
|
0 |
0 |
if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data()); |
|
0 |
0 |
if (unsigned(statistical_guesser_default) >= tags.size()) tags.emplace_back(line.data()); |
9099
|
0 |
0 |
while (getline(is, line)) { |
|
0 |
0 |
while (getline(is, line)) { |
9100
|
0 |
0 |
split(line, '\t', tokens); |
9101
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
|
0 |
0 |
if (tokens.size() < 3 || (tokens.size() % 2) != 1) training_failure("Cannot parse line " << line << " in statistical guesser file!"); |
9104
|
0 |
0 |
split(tokens[0], ' ', affixes); |
9105
|
0 |
0 |
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
|
0 |
0 |
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
|
0 |
0 |
if (affixes.size() != 2) training_failure("Cannot parse prefix_suffix '" << tokens[0] << "' in statistical guesser file!"); |
9108
|
0 |
0 |
auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]]; |
|
0 |
0 |
auto& rules = statistical_guesser[affixes[1] + ' ' + affixes[0]]; |
9109
|
0 |
0 |
for (unsigned i = 1; i < tokens.size(); i+= 2) { |
9111
|
0 |
0 |
split(tokens[i], ' ', replacements); |
9112
|
0 |
0 |
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
|
0 |
0 |
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
|
0 |
0 |
if (replacements.size() != 4) training_failure("Cannot parse replacement rule '" << tokens[i] << "' in statistical guesser file!"); |
9115
|
0 |
0 |
split(tokens[i+1], ' ', rule_tags); |
9117
|
0 |
0 |
for (auto&& rule_tag : rule_tags) { |
9119
|
0 |
0 |
if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag); |
|
0 |
0 |
if (unsigned(tag) >= tags.size()) tags.emplace_back(rule_tag); |
9120
|
0 |
0 |
decoded_tags.emplace_back(tag); |
9123
|
0 |
0 |
rules.emplace_back(replacements, decoded_tags); |
9128
|
0 |
0 |
enc.add_2B(tags.size()); |
9129
|
0 |
0 |
for (auto&& tag : tags) { |
9130
|
0 |
0 |
enc.add_1B(tag.size()); |
9133
|
0 |
0 |
enc.add_2B(statistical_guesser_default); |
9137
|
0 |
0 |
e.add_1B(rules.size()); |
9138
|
0 |
0 |
for (auto&& rule : rules) { |
9139
|
0 |
0 |
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
|
0 |
0 |
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
|
0 |
0 |
if (rule.first.size() != 4) training_failure("Replacement rule not of size 4 in statistical guesser!"); |
9140
|
0 |
0 |
for (auto&& affix : rule.first) { |
9141
|
0 |
0 |
e.add_1B(affix.size()); |
9144
|
0 |
0 |
e.add_1B(rule.second.size()); |
9145
|
0 |
0 |
for (auto&& tag : rule.second) |
9146
|
0 |
0 |
e.add_2B(tag); |
9148
|
0 |
0 |
enc.add_2B(e.data.size()); |
9150
|
0 |
0 |
}).save(enc); |
|
0 |
0 |
}).save(enc); |
9211
|
0 |
0 |
for (string line; getline(is, line);) { |
|
0 |
0 |
for (string line; getline(is, line);) { |
9212
|
0 |
0 |
if (line.empty()) continue; |
9214
|
0 |
0 |
split(line, '\t', tokens); |
9215
|
0 |
0 |
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("The guesser training line '" << line << "' does not contain three columns!"); |
9216
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
|
0 |
0 |
if (tokens[0].empty() || tokens[1].empty() || tokens[2].empty()) training_failure("The guesser training line '" << line << "' contains an empty column!"); |
9221
|
0 |
0 |
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
|
0 |
0 |
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
|
0 |
0 |
if ((lemma_case == CASE_LC && (form_case == CASE_UCLC || form_case == CASE_UC)) || |
9223
|
0 |
0 |
set_casing(tokens[0], lemma_case, form); |
9228
|
0 |
0 |
data.emplace_back(form, tokens[1], tokens[2]); |
9233
|
0 |
0 |
for (auto&& instance : data) |
9234
|
0 |
0 |
if (!instance.form_prefix.empty()) |
9238
|
0 |
0 |
for (auto&& prefix : prefixes_with_forms) |
9239
|
0 |
0 |
if (prefix.second.size() >= min_prefix_count) |
9240
|
0 |
0 |
prefixes_with_counts.emplace_back(unsigned(prefix.second.size()), prefix.first); |
9242
|
0 |
0 |
if (prefixes_with_counts.size() > max_prefixes) { |
9244
|
0 |
0 |
prefixes_with_counts.resize(max_prefixes); |
9249
|
0 |
0 |
for (auto&& prefix : prefixes_with_counts) |
9257
|
0 |
0 |
for (auto&& instance : data) { |
9263
|
0 |
0 |
for (auto&& prefix : prefixes) |
9264
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
|
0 |
0 |
if (prefix.size() > prefix_length && instance.form.compare(0, prefix.size(), prefix) == 0) |
9267
|
0 |
0 |
tag_lemma_rule.assign(instance.lemma_rule).append("\t").append(instance.tag); |
9270
|
0 |
0 |
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
|
0 |
0 |
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
|
0 |
0 |
for (unsigned length = 0, utf8_length = 0; length < suffix_len && suffix(instance.form, utf8_length); length++) { |
9271
|
0 |
0 |
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
0 |
0 |
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
|
0 |
0 |
prefix_suffix.assign(instance.form, 0, prefix_length).append(" ").append(instance.form, instance.form.size() - utf8_length, utf8_length); |
9279
|
0 |
0 |
for (auto&& tag : tags) |
9280
|
0 |
0 |
if (tag.second.size() > most_frequent_tag_count) |
9289
|
0 |
0 |
for (auto&& suffix : suffixes) { |
9290
|
0 |
0 |
for (auto&& prefix : prefixes) { |
9295
|
0 |
0 |
for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) { |
|
0 |
0 |
for (int prefix_len = int(prefix.size()); prefix_len >= 0; prefix_len -= prefix.empty() ? 1 : prefix.size()) { |
9296
|
0 |
0 |
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
|
0 |
0 |
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
|
0 |
0 |
for (int suffix_len = int(suffix.size()); rules_counts.size() < rules_per_suffix && suffix_len > 0; suffix_len--) { |
9297
|
0 |
0 |
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
|
0 |
0 |
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
|
0 |
0 |
rule_key.assign(prefix, 0, prefix_len).append(" ").append(suffix, suffix.size() - suffix_len, suffix_len); |
9298
|
0 |
0 |
if (!rules.count(rule_key)) continue; |
9301
|
0 |
0 |
for (auto&& entry : rules[rule_key]) |
9302
|
0 |
0 |
if (!rules_set.count(entry.first)) { |
9303
|
0 |
0 |
rules_counts.emplace_back(unsigned(entry.second.size()), entry.first); |
9309
|
0 |
0 |
if (rules_counts.size() >= rules_per_suffix) { |
9310
|
0 |
0 |
rules_counts.resize(rules_per_suffix); |
9315
|
0 |
0 |
if (rules_set.empty()) break; |
9317
|
0 |
0 |
if (!rules_set.empty()) { |
9319
|
0 |
0 |
output.assign(prefix).append(" ").append(suffix); |
9320
|
0 |
0 |
for (unsigned i = 0; i < rules_counts.size(); i++) { |
9323
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
|
0 |
0 |
output.append("\t").append(rules_counts[i].second, 0, tab).append("\t").append(rules_counts[i].second, tab + 1, string::npos); |
9326
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
|
0 |
0 |
for (unsigned start = i; i+1 < rules_counts.size() && rules_counts[i+1].second.compare(0, tab + 1, rules_counts[start].second, 0, tab + 1) == 0; i++) |
9327
|
0 |
0 |
output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos); |
|
0 |
0 |
output.append(" ").append(rules_counts[i+1].second, tab + 1, string::npos); |
9342
|
0 |
0 |
for (int offset = -int(lemma.size() - 1); offset < int(form.size()) - 1; offset++) { |
9345
|
0 |
0 |
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
|
0 |
0 |
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
|
0 |
0 |
for (unsigned length = 0; form_offset < form.size() && lemma_offset < lemma.size(); form_offset++, lemma_offset++) |
9346
|
0 |
0 |
if (form[form_offset] == lemma[lemma_offset]) { |
9347
|
0 |
0 |
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
|
0 |
0 |
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
|
0 |
0 |
if (++length > length_best && utf8::valid(form.c_str() + form_offset + 1 - length, length)) |
9354
|
0 |
0 |
form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0); |
|
0 |
0 |
form_prefix.assign(form, 0, lemma_best == 0 ? form_best : 0); |
9355
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
|
0 |
0 |
lemma_rule.assign(form, 0, form_best).append(" ").append(lemma, 0, lemma_best).append(" ") |
9356
|
0 |
0 |
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
|
0 |
0 |
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
|
0 |
0 |
.append(form, form_best + length_best, string::npos).append(" ").append(lemma, lemma_best + length_best, string::npos); |
9364
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) { |
9368
|
0 |
0 |
if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue; |
|
0 |
0 |
if (allow_nonletters && index >= 2 && cat & ~unicode::L) continue; |
9369
|
0 |
0 |
if (cat & ~unicode::L) return CASE_OTHER; |
9371
|
0 |
0 |
if (index == 0) { |
9372
|
0 |
0 |
c = cat & unicode::Ll ? CASE_LC : CASE_UC; |
9373
|
0 |
0 |
} else if (c == CASE_UC && index == 1) { |
9374
|
0 |
0 |
c = cat & unicode::Ll ? CASE_UCLC : CASE_UC; |
9375
|
0 |
0 |
} else if (c == CASE_UC) { |
9376
|
0 |
0 |
if (cat & ~unicode::Lut) return CASE_OTHER; |
9378
|
0 |
0 |
if (cat & ~unicode::Ll) return CASE_OTHER; |
9390
|
0 |
0 |
for (auto&& chr : utf8::decoder(original)) { |
9391
|
0 |
0 |
utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr)); |
|
0 |
0 |
utf8::append(word, (c == CASE_UC || (c == CASE_UCLC && first)) ? unicode::uppercase(chr) : unicode::lowercase(chr)); |
9400
|
0 |
0 |
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
|
0 |
0 |
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
|
0 |
0 |
while (additional + length <= word.size() && !utf8::valid(word.c_str() + word.size() - length - additional, additional)) |
9403
|
0 |
0 |
if (additional + length > word.size()) return false; |
9427
|
0 |
0 |
if (line.empty()) { |
9428
|
0 |
0 |
if (!getline(in, line)) |
9431
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
9435
|
0 |
0 |
if (seen_lemmas.count(lemma)) |
9436
|
0 |
0 |
training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!"); |
|
0 |
0 |
training_failure("Raw morphological dictionary contains lemma '" << lemma << "' multiple times - all forms of one lemma must be in continuous region!"); |
9441
|
0 |
0 |
while (getline(in, line)) { |
9443
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("Line " << line << " does not have three columns!"); |
9445
|
0 |
0 |
if (lemma != tokens[0]) break; |
9470
|
0 |
0 |
if (!filter) return; |
9472
|
0 |
0 |
wildcard.assign(filter); |
9475
|
0 |
0 |
for (int tag_pos = 0, filter_pos = 0; filter[filter_pos]; tag_pos++, filter_pos++) { |
9476
|
0 |
0 |
if (filter[filter_pos] == '?') continue; |
9477
|
0 |
0 |
if (filter[filter_pos] == '[') { |
9481
|
0 |
0 |
if (filter[filter_pos] == '^') negate = true, filter_pos++; |
9484
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
|
0 |
0 |
for (bool first = true; filter[filter_pos] && (first || filter[filter_pos] != ']'); first = false) |
9487
|
0 |
0 |
filters.emplace_back(tag_pos, negate, chars_start, filter_pos - chars_start); |
9488
|
0 |
0 |
if (!filter[filter_pos]) break; |
9490
|
0 |
0 |
filters.emplace_back(tag_pos, false, filter_pos, 1); |
9543
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
0 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
10 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
8 |
2 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
10 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
5 |
14 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
7 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
|
2 |
0 |
return it ? unaligned_load(it) : elementary_feature_unknown; |
9551
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
9554
|
1 |
0 |
maps.resize(data.next_1B()); |
|
1 |
0 |
maps.resize(data.next_1B()); |
9555
|
27 |
1 |
for (auto&& map : maps) |
9556
|
27 |
0 |
map.load(data); |
|
0 |
0 |
map.load(data); |
9598
|
1171 |
0 |
if (value < 0x80) *where++ = value; |
9599
|
0 |
0 |
else if (value < 0x4000) *where++ = (value >> 7) | 0x80u, *where++ = value & 0x7Fu; |
9600
|
0 |
0 |
else if (value < 0x200000) *where++ = (value >> 14) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
9601
|
0 |
0 |
else if (value < 0x10000000) *where++ = (value >> 21) | 0x80u, *where++ = ((value >> 14) & 0x7Fu) | 0x80u, *where++ = ((value >> 7) & 0x7Fu) | 0x80u, *where++ = value & 0x7Fu; |
9608
|
0 |
0 |
while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u); |
|
0 |
0 |
while (((unsigned char)(*from)) & 0x80u) value = (value << 7) | (((unsigned char)(*from++)) ^ 0x80u); |
9643
|
0 |
0 |
struct feature_sequence { |
|
0 |
0 |
struct feature_sequence { |
9649
|
0 |
0 |
class feature_sequences { |
|
0 |
0 |
class feature_sequences { |
|
1 |
0 |
class feature_sequences { |
|
0 |
0 |
class feature_sequences { |
|
0 |
0 |
class feature_sequences { |
9678
|
0 |
0 |
return it ? unaligned_load(it) : 0; |
|
0 |
0 |
return it ? unaligned_load(it) : 0; |
|
270 |
76 |
return it ? unaligned_load(it) : 0; |
9687
|
1 |
0 |
if (!elementary.load(is)) return false; |
|
0 |
0 |
if (!elementary.load(is)) return false; |
|
0 |
0 |
if (!elementary.load(is)) return false; |
9690
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
|
0 |
0 |
if (!compressor::load(is, data)) return false; |
9693
|
1 |
0 |
sequences.resize(data.next_1B()); |
|
1 |
0 |
sequences.resize(data.next_1B()); |
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
0 |
0 |
sequences.resize(data.next_1B()); |
|
0 |
0 |
sequences.resize(data.next_1B()); |
9694
|
74 |
1 |
for (auto&& sequence : sequences) { |
|
0 |
0 |
for (auto&& sequence : sequences) { |
|
0 |
0 |
for (auto&& sequence : sequences) { |
9695
|
74 |
0 |
sequence.dependant_range = data.next_4B(); |
|
0 |
0 |
sequence.dependant_range = data.next_4B(); |
|
0 |
0 |
sequence.dependant_range = data.next_4B(); |
9696
|
74 |
0 |
sequence.elements.resize(data.next_1B()); |
|
74 |
0 |
sequence.elements.resize(data.next_1B()); |
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
|
0 |
0 |
sequence.elements.resize(data.next_1B()); |
9697
|
154 |
74 |
for (auto&& element : sequence.elements) { |
|
0 |
0 |
for (auto&& element : sequence.elements) { |
|
0 |
0 |
for (auto&& element : sequence.elements) { |
9698
|
154 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
|
0 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
|
0 |
0 |
element.type = elementary_feature_type(data.next_4B()); |
9699
|
154 |
0 |
element.elementary_index = data.next_4B(); |
|
0 |
0 |
element.elementary_index = data.next_4B(); |
|
0 |
0 |
element.elementary_index = data.next_4B(); |
9700
|
154 |
0 |
element.sequence_index = data.next_4B(); |
|
0 |
0 |
element.sequence_index = data.next_4B(); |
|
0 |
0 |
element.sequence_index = data.next_4B(); |
9704
|
1 |
0 |
scores.resize(data.next_1B()); |
|
1 |
0 |
scores.resize(data.next_1B()); |
|
0 |
0 |
scores.resize(data.next_1B()); |
|
0 |
0 |
scores.resize(data.next_1B()); |
|
0 |
0 |
scores.resize(data.next_1B()); |
|
0 |
0 |
scores.resize(data.next_1B()); |
9705
|
74 |
1 |
for (auto&& score : scores) |
|
0 |
0 |
for (auto&& score : scores) |
|
0 |
0 |
for (auto&& score : scores) |
9706
|
74 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
|
0 |
0 |
score.load(data); |
9726
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
|
0 |
0 |
cache_element(int elements) : key(vli::max_length() * elements), key_size(0), score(0) {} |
9734
|
0 |
0 |
caches.reserve(self.sequences.size()); |
|
0 |
0 |
caches.reserve(self.sequences.size()); |
|
1 |
0 |
caches.reserve(self.sequences.size()); |
|
0 |
0 |
caches.reserve(self.sequences.size()); |
9736
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
|
74 |
1 |
for (auto&& sequence : self.sequences) { |
|
0 |
0 |
for (auto&& sequence : self.sequences) { |
9737
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
74 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
|
0 |
0 |
caches.emplace_back(int(sequence.elements.size())); |
9738
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
2 |
72 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
|
0 |
0 |
if (int(sequence.elements.size()) > max_sequence_elements) max_sequence_elements = sequence.elements.size(); |
9739
|
0 |
0 |
for (auto&& element : sequence.elements) |
|
0 |
0 |
for (auto&& element : sequence.elements) |
|
154 |
74 |
for (auto&& element : sequence.elements) |
|
0 |
0 |
for (auto&& element : sequence.elements) |
9740
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
92 |
62 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
2 |
90 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
|
0 |
0 |
if (element.type == PER_TAG && 1 - element.sequence_index > max_window_size) |
9743
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
1 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
|
0 |
0 |
key.resize(max_sequence_elements * vli::max_length()); |
9744
|
0 |
0 |
window.resize(max_window_size); |
|
0 |
0 |
window.resize(max_window_size); |
|
1 |
0 |
window.resize(max_window_size); |
|
0 |
0 |
window.resize(max_window_size); |
9755
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
1 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
|
0 |
0 |
if (forms.size() > c.elementary_per_form.size()) c.elementary_per_form.resize(forms.size() * 2); |
9756
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
1 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
|
0 |
0 |
if (forms.size() > c.elementary_per_tag.size()) c.elementary_per_tag.resize(forms.size() * 2); |
9757
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
9758
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
7 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
|
0 |
0 |
if (analyses[i].size() > c.elementary_per_tag[i].size()) |
9766
|
0 |
0 |
for (auto&& cache : c.caches) |
|
0 |
0 |
for (auto&& cache : c.caches) |
|
74 |
1 |
for (auto&& cache : c.caches) |
|
0 |
0 |
for (auto&& cache : c.caches) |
9772
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
12 |
3 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
|
0 |
0 |
elementary.compute_dynamic_features((*c.analyses)[form_index][tag_index], c.elementary_per_form[form_index], c.elementary_per_tag[form_index][tag_index], form_index > 0 ? prev_dynamic : nullptr, dynamic); |
9778
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
36 |
7 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
6 |
30 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
30 |
13 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
|
0 |
0 |
for (int i = 0; i < int(c.window.size()) && form_index - i >= 0; i++) |
9783
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
658 |
8 |
for (unsigned i = 0; i < sequences.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < sequences.size(); i++) { |
9784
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
653 |
5 |
if (tags_unchanged >= sequences[i].dependant_range) |
|
0 |
0 |
if (tags_unchanged >= sequences[i].dependant_range) |
9788
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
1345 |
479 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
|
0 |
0 |
for (unsigned j = 0; j < sequences[i].elements.size(); j++) { |
9794
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
458 |
17 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
452 |
6 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 || unsigned(form_index + element.sequence_index) >= c.forms->size() ? elementary_feature_empty : c.elementary_per_form[form_index + element.sequence_index].values[element.elementary_index]; |
9797
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
778 |
66 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
|
0 |
0 |
value = form_index + element.sequence_index < 0 ? elementary_feature_empty : c.window[-element.sequence_index]->values[element.elementary_index]; |
9804
|
0 |
0 |
if (value == elementary_feature_unknown) { |
|
0 |
0 |
if (value == elementary_feature_unknown) { |
|
174 |
1171 |
if (value == elementary_feature_unknown) { |
|
0 |
0 |
if (value == elementary_feature_unknown) { |
9813
|
0 |
0 |
if (!key_size) { |
|
0 |
0 |
if (!key_size) { |
|
174 |
479 |
if (!key_size) { |
|
0 |
0 |
if (!key_size) { |
9816
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
355 |
124 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
222 |
133 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
346 |
133 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
|
0 |
0 |
} else if (key_size != c.caches[i].key_size || !small_memeq(c.key.data(), c.caches[i].key.data(), key_size)) { |
9833
|
0 |
0 |
for (unsigned i = 0; i < c.caches.size(); i++) |
9877
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
1 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
|
0 |
0 |
cache(const viterbi& self) : features_cache(self.features) {} |
9890
|
0 |
0 |
if (!forms.size()) return; |
|
0 |
0 |
if (!forms.size()) return; |
|
1 |
0 |
if (!forms.size()) return; |
|
0 |
0 |
if (!forms.size()) return; |
9894
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
7 |
1 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0, states = 1; i < forms.size(); i++) { |
9895
|
0 |
0 |
if (analyses[i].empty()) return; |
|
0 |
0 |
if (analyses[i].empty()) return; |
|
7 |
0 |
if (analyses[i].empty()) return; |
|
0 |
0 |
if (analyses[i].empty()) return; |
9896
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
5 |
2 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
|
0 |
0 |
states = (i+1 >= unsigned(decoding_order) ? states / analyses[i-decoding_order+1].size() : states) * analyses[i].size(); |
9899
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
1 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
|
0 |
0 |
if (nodes > c.nodes.size()) c.nodes.resize(nodes); |
9905
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
1 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
|
0 |
0 |
int* window = window_size <= 16 ? window_stack : (window_heap.resize(window_size), window_heap.data()); |
9911
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
9914
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
7 |
21 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
|
0 |
0 |
for (int j = 0; j < window_size; j++) window[j] = -1; |
9915
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
10 |
7 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
|
0 |
0 |
for (int tag = 0; tag < int(analyses[i].size()); tag++) |
9916
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
15 |
10 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
|
0 |
0 |
for (int prev = nodes_prev; prev < nodes_now; prev++) { |
9920
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
27 |
9 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
21 |
6 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
|
0 |
0 |
for (int p = prev, n = 1; p >= 0 && n < window_size; p = c.nodes[p].prev, n++) { |
9921
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
7 |
14 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
5 |
2 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
|
0 |
0 |
same_tags += same_tags == n && window[n] == c.nodes[p].tag; |
9926
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
12 |
3 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
|
0 |
0 |
features.compute_dynamic_features(i, tag, prev >= 0 ? &c.nodes[prev].dynamic : nullptr, dynamic, c.features_cache); |
9927
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
7 |
8 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
5 |
2 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
12 |
3 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
|
0 |
0 |
score = (nodes_prev + 1 == nodes_now && analyses[i].size() == 1 ? 0 : features.score(i, window, same_tags, dynamic, c.features_cache)) + |
9931
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
|
2 |
13 |
if (same_tags >= decoding_order-1) { |
|
0 |
0 |
if (same_tags >= decoding_order-1) { |
9932
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
1 |
1 |
if (score <= c.nodes[nodes_next-1].score) continue; |
|
0 |
0 |
if (score <= c.nodes[nodes_next-1].score) continue; |
9947
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
1 |
1 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
|
0 |
0 |
for (int node = nodes_prev + 1; node < nodes_now; node++) |
9948
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
1 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
|
0 |
0 |
if (c.nodes[node].score > c.nodes[best].score) |
9951
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
7 |
1 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
|
0 |
0 |
for (int i = forms.size() - 1; i >= 0; i--, best = c.nodes[best].prev) |
10000
|
0 |
0 |
maps.resize(MAP_TOTAL); |
|
1 |
0 |
maps.resize(MAP_TOTAL); |
10004
|
2 |
0 |
vector conllu_elementary_features |
|
0 |
2 |
vector conllu_elementary_features |
|
68 |
2 |
vector conllu_elementary_features |
|
0 |
0 |
vector conllu_elementary_features |
10049
|
7 |
1 |
for (unsigned i = forms.size(); i--;) { |
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
10053
|
10 |
7 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
10064
|
3 |
7 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
0 |
3 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == lemma ? per_tag[i][j-1].values[LEMMA] : |
10069
|
0 |
10 |
if (index == string::npos) index = tag.size(); |
|
0 |
0 |
if (index == string::npos) index = tag.size(); |
10070
|
0 |
10 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
0 |
10 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
0 |
0 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
|
0 |
0 |
per_tag[i][j].values[TAG_UPOS] = maps[MAP_TAG_UPOS].value(tag.c_str() + (index ? 1 : 0), index - (index ? 1 : 0)); |
10072
|
10 |
0 |
if (index < tag.size()) index++; |
|
0 |
0 |
if (index < tag.size()) index++; |
10073
|
10 |
0 |
if (index < tag.size()) index = tag.find(separator, index); |
|
0 |
0 |
if (index < tag.size()) index = tag.find(separator, index); |
10074
|
10 |
0 |
if (index < tag.size()) index++; |
|
0 |
0 |
if (index < tag.size()) index++; |
10075
|
40 |
10 |
for (size_t length; index < tag.size(); index += length + 1) { |
|
0 |
0 |
for (size_t length; index < tag.size(); index += length + 1) { |
10077
|
6 |
34 |
length = (length == string::npos ? tag.size() : length) - index; |
|
0 |
0 |
length = (length == string::npos ? tag.size() : length) - index; |
10079
|
280 |
0 |
for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++) |
|
0 |
0 |
for (size_t equal_sign = 0; equal_sign + 1 < length; equal_sign++) |
10080
|
240 |
40 |
if (tag[index + equal_sign] == '=') { |
|
0 |
0 |
if (tag[index + equal_sign] == '=') { |
10084
|
2 |
4 |
if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE; |
|
0 |
0 |
if (tag.compare(index, equal_sign, "Case") == 0) value = TAG_CASE, map = MAP_TAG_CASE; |
10087
|
2 |
14 |
if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER; |
|
0 |
0 |
if (tag.compare(index, equal_sign, "Gender") == 0) value = TAG_GENDER, map = MAP_TAG_GENDER; |
10088
|
6 |
10 |
if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER; |
|
0 |
0 |
if (tag.compare(index, equal_sign, "Number") == 0) value = TAG_NUMBER, map = MAP_TAG_NUMBER; |
10089
|
4 |
12 |
if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON; |
|
0 |
0 |
if (tag.compare(index, equal_sign, "Person") == 0) value = TAG_PERSON, map = MAP_TAG_PERSON; |
10092
|
5 |
5 |
if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE; |
|
0 |
0 |
if (tag.compare(index, equal_sign, "Negative") == 0) value = TAG_NEGATIVE, map = MAP_TAG_NEGATIVE; |
10096
|
19 |
21 |
if (value >= 0) |
|
0 |
0 |
if (value >= 0) |
10102
|
10 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
6 |
4 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
4 |
6 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
|
0 |
0 |
if (tag.size() >= 2 && tag[1] == 'V') { |
10104
|
2 |
2 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
2 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
10120
|
5 |
2 |
if (analyses[i].size() == 1) { |
|
0 |
0 |
if (analyses[i].size() == 1) { |
10128
|
0 |
2 |
} else if (forms[i].len <= 0) { |
|
0 |
0 |
} else if (forms[i].len <= 0) { |
10143
|
16 |
2 |
while (form.len) { |
|
0 |
0 |
while (form.len) { |
10147
|
16 |
0 |
num = num || cat & unicode::N; |
|
16 |
0 |
num = num || cat & unicode::N; |
|
0 |
0 |
num = num || cat & unicode::N; |
|
0 |
0 |
num = num || cat & unicode::N; |
10148
|
10 |
6 |
cap = cap || cat & unicode::Lut; |
|
9 |
1 |
cap = cap || cat & unicode::Lut; |
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
10149
|
16 |
0 |
dash = dash || cat & unicode::Pd; |
|
16 |
0 |
dash = dash || cat & unicode::Pd; |
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
10151
|
16 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
14 |
2 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
2 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
10181
|
12 |
3 |
if (prev_dynamic) { |
|
0 |
0 |
if (prev_dynamic) { |
10189
|
15 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
11 |
4 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
4 |
11 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
|
0 |
0 |
if (tag.tag.size() >= 2 && tag.tag[1] == 'V') { |
10243
|
0 |
0 |
maps.resize(MAP_TOTAL); |
10279
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
10283
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
10286
|
0 |
0 |
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
10287
|
0 |
0 |
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
10288
|
0 |
0 |
per_tag[i][j].values[TAG25] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG25].value((tag25[0] = analyses[i][j].tag[1], tag25[1] = analyses[i][j].tag[4], tag25), 2) : elementary_feature_empty; |
10289
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
10292
|
0 |
0 |
if (analyses[i][j].tag[0] == 'V') { |
10294
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
10304
|
0 |
0 |
if (verb_candidate >= 0) { |
10310
|
0 |
0 |
if (analyses[i].size() == 1) { |
10314
|
0 |
0 |
} else if (forms[i].len <= 0) { |
10325
|
0 |
0 |
while (form.len) { |
10329
|
0 |
0 |
num = num || cat & unicode::N; |
|
0 |
0 |
num = num || cat & unicode::N; |
10330
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
10331
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
10333
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
|
0 |
0 |
if (index == 5 || (!form.len && index < 5)) { |
10353
|
0 |
0 |
if (prev_dynamic) { |
10361
|
0 |
0 |
if (tag.tag[0] == 'V') { |
10415
|
0 |
0 |
maps.resize(MAP_TOTAL); |
10463
|
0 |
0 |
for (unsigned i = forms.size(); i--;) { |
10467
|
0 |
0 |
for (unsigned j = 0; j < analyses[i].size(); j++) { |
10469
|
0 |
0 |
per_tag[i][j].values[TAG1] = analyses[i][j].tag.size() >= 1 ? maps[MAP_TAG1].value(analyses[i][j].tag.c_str() + 0, 1) : elementary_feature_empty; |
10470
|
0 |
0 |
per_tag[i][j].values[TAG2] = analyses[i][j].tag.size() >= 2 ? maps[MAP_TAG2].value(analyses[i][j].tag.c_str() + 1, 1) : elementary_feature_empty; |
10471
|
0 |
0 |
per_tag[i][j].values[TAG3] = analyses[i][j].tag.size() >= 3 ? maps[MAP_TAG3].value(analyses[i][j].tag.c_str() + 2, 1) : elementary_feature_empty; |
10472
|
0 |
0 |
per_tag[i][j].values[TAG4] = analyses[i][j].tag.size() >= 4 ? maps[MAP_TAG4].value(analyses[i][j].tag.c_str() + 3, 1) : elementary_feature_empty; |
10473
|
0 |
0 |
per_tag[i][j].values[TAG5] = analyses[i][j].tag.size() >= 5 ? maps[MAP_TAG5].value(analyses[i][j].tag.c_str() + 4, 1) : elementary_feature_empty; |
10474
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
|
0 |
0 |
per_tag[i][j].values[LEMMA] = j && analyses[i][j-1].lemma == analyses[i][j].lemma ? per_tag[i][j-1].values[LEMMA] : |
10477
|
0 |
0 |
if (analyses[i][j].tag[0] == 'V') { |
10479
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
|
0 |
0 |
verb_candidate = verb_candidate < 0 || (tag_compare = analyses[i][j].tag.compare(analyses[i][verb_candidate].tag), tag_compare < 0) || (tag_compare == 0 && analyses[i][j].lemma < analyses[i][verb_candidate].lemma) ? j : verb_candidate; |
10489
|
0 |
0 |
if (verb_candidate >= 0) { |
10495
|
0 |
0 |
if (analyses[i].size() == 1) { |
10503
|
0 |
0 |
} else if (forms[i].len <= 0) { |
10518
|
0 |
0 |
while (form.len) { |
10522
|
0 |
0 |
num = num || cat & unicode::N; |
|
0 |
0 |
num = num || cat & unicode::N; |
10523
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
|
0 |
0 |
cap = cap || cat & unicode::Lut; |
10524
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
|
0 |
0 |
dash = dash || cat & unicode::Pd; |
10526
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
|
0 |
0 |
if (index == 10 || (!form.len && index < 10)) { |
10556
|
0 |
0 |
if (prev_dynamic) { |
10564
|
0 |
0 |
if (tag.tag[0] == 'V') { |
10615
|
0 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
0 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
|
1 |
0 |
cache(const perceptron_tagger& self) : decoder_cache(self.decoder) {} |
10629
|
1 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
|
0 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
|
0 |
0 |
if (dict.reset(morpho::load(is)), !dict) return false; |
10631
|
1 |
0 |
if (!features.load(is)) return false; |
|
0 |
0 |
if (!features.load(is)) return false; |
|
0 |
0 |
if (!features.load(is)) return false; |
10643
|
0 |
0 |
if (!dict) return; |
|
0 |
0 |
if (!dict) return; |
|
1 |
0 |
if (!dict) return; |
10646
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
1 |
0 |
if (!c) c = new cache(*this); |
|
1 |
0 |
if (!c) c = new cache(*this); |
10649
|
0 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
0 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
|
1 |
0 |
if (c->analyses.size() < forms.size()) c->analyses.resize(forms.size()); |
10650
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) { |
10653
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
0 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
7 |
0 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
|
0 |
7 |
dict->analyze(forms[i], guesser >= 0 ? guesser : use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, c->analyses[i]); |
10656
|
0 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
0 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
|
1 |
0 |
if (c->tags.size() < forms.size()) c->tags.resize(forms.size() * 2); |
10659
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) |
|
7 |
1 |
for (unsigned i = 0; i < forms.size(); i++) |
10670
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
|
0 |
0 |
if (!c) c = new cache(*this); |
10696
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
1 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
|
0 |
0 |
tagger_id id = tagger_id(is.get()); |
10702
|
0 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
10703
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
10711
|
0 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
10712
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
10719
|
1 |
0 |
auto res = new_unique_ptr>>(tagger_ids::decoding_order(id), tagger_ids::window_size(id)); |
10720
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
1 |
0 |
if (res->load(is)) return res.release(); |
10729
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
10730
|
0 |
0 |
if (!f) return nullptr; |
10732
|
0 |
0 |
return load(f); |
10737
|
0 |
0 |
return morpho ? morpho->new_tokenizer() : nullptr; |
10842
|
0 |
0 |
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
|
0 |
0 |
for (int i = 0; i < 15 && pdt_tag[i]; i++) |
10843
|
0 |
0 |
if (pdt_tag[i] != '-') { |
10844
|
0 |
0 |
if (!tag.empty()) tag.push_back('|'); |
10851
|
0 |
0 |
for (unsigned i = 0; i + 2 < lemma.size(); i++) |
10852
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
|
0 |
0 |
if (lemma[i] == '_' && lemma[i + 1] == ';') { |
10853
|
0 |
0 |
if (!tag.empty()) tag.push_back('|'); |
10862
|
0 |
0 |
return raw_lemma < lemma.size() ? (lemma.resize(raw_lemma), true) : false; |
10873
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) { |
10879
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
10887
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) { |
10888
|
0 |
0 |
for (auto&& tagged_form : tagged_lemma_forms.forms) |
10894
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
10948
|
0 |
0 |
return lemma_id_len < lemma.size() ? (lemma.resize(lemma_id_len), true) : false; |
10958
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) |
10962
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
10970
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) |
10974
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
11028
|
0 |
0 |
return raw_lemma_len < lemma.size() ? (lemma.resize(raw_lemma_len), true) : false; |
11038
|
0 |
0 |
for (auto&& tagged_lemma : tagged_lemmas) |
11042
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || tagged_lemmas.size() < 2) return; |
11050
|
0 |
0 |
for (auto&& tagged_lemma_forms : forms) |
11054
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
|
0 |
0 |
if (!lemma_changed || forms.size() < 2) return; |
11093
|
0 |
0 |
if (name == "pdt_to_conll2009") return tagset_converter::new_pdt_to_conll2009_converter(); |
11094
|
0 |
0 |
if (name == "strip_lemma_comment") return tagset_converter::new_strip_lemma_comment_converter(dictionary); |
11095
|
0 |
0 |
if (name == "strip_lemma_id") return tagset_converter::new_strip_lemma_id_converter(dictionary); |
11102
|
0 |
0 |
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
|
0 |
0 |
inline static bool eq(const tagged_lemma& a, const tagged_lemma& b) { return a.lemma == b.lemma && a.tag == b.tag; } |
11103
|
0 |
0 |
inline static bool lt(const tagged_lemma& a, const tagged_lemma& b) { int lemma_compare = a.lemma.compare(b.lemma); return lemma_compare < 0 || (lemma_compare == 0 && a.tag < b.tag); } |
11112
|
0 |
0 |
for (unsigned i = 0; i < forms.size(); i++) { |
11114
|
0 |
0 |
for (unsigned j = forms.size() - 1; j > i; j--) |
11115
|
0 |
0 |
if (forms[j].lemma == forms[i].lemma) { |
11117
|
0 |
0 |
for (auto&& tagged_form : forms[j].forms) |
11121
|
0 |
0 |
if (j < forms.size() - 1) { |
11129
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
|
0 |
0 |
if (any_merged && forms[i].forms.size() > 1) { |
11132
|
0 |
0 |
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
|
0 |
0 |
inline static bool eq(const tagged_form& a, const tagged_form& b) { return a.tag == b.tag && a.form == b.form; } |
11133
|
0 |
0 |
inline static bool lt(const tagged_form& a, const tagged_form& b) { int tag_compare = a.tag.compare(b.tag); return tag_compare < 0 || (tag_compare == 0 && a.form < b.form); } |
11291
|
214 |
2 |
const unordered_set czech_tokenizer::abbreviations_czech = { |
|
0 |
0 |
const unordered_set czech_tokenizer::abbreviations_czech = { |
11307
|
206 |
2 |
const unordered_set czech_tokenizer::abbreviations_slovak = { |
|
0 |
0 |
const unordered_set czech_tokenizer::abbreviations_slovak = { |
11324
|
0 |
0 |
: ragel_tokenizer(version <= 1 ? 1 : 2), m(m) { |
|
0 |
0 |
: ragel_tokenizer(version <= 1 ? 1 : 2), m(m) { |
11338
|
0 |
0 |
if (!m) return; |
11339
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unicode::L) return; |
11342
|
0 |
0 |
for (unsigned hyphens = 1; hyphens <= 2; hyphens++) { |
11344
|
0 |
0 |
if (tokens.size() < 2*hyphens + 1) break; |
11346
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
|
0 |
0 |
if (tokens[first_hyphen].length != 1 || chars[tokens[first_hyphen].start].cat & ~unicode::P || |
11347
|
0 |
0 |
tokens[first_hyphen].start + tokens[first_hyphen].length != tokens[first_hyphen + 1].start || |
11348
|
0 |
0 |
tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start || |
|
0 |
0 |
tokens[first_hyphen-1].start + tokens[first_hyphen-1].length != tokens[first_hyphen].start || |
11352
|
0 |
0 |
if (m->analyze(string_piece(chars[tokens[first_hyphen-1].start].str, chars[tokens.back().start + tokens.back().length].str - chars[tokens[first_hyphen-1].start].str), morpho::NO_GUESSER, lemmas) >= 0) |
11356
|
0 |
0 |
if (matched_hyphens) { |
11370
|
0 |
0 |
while (tokenize_url_email(tokens)) |
11371
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
11387
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
11392
|
0 |
0 |
switch ( _czech_tokenizer_from_state_actions[cs] ) { |
11401
|
0 |
0 |
if ( _klen > 0 ) { |
11406
|
0 |
0 |
if ( _upper < _lower ) |
11410
|
0 |
0 |
if ( _widec < _mid[0] ) |
11412
|
0 |
0 |
else if ( _widec > _mid[1] ) |
11418
|
0 |
0 |
if ( |
11419
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
11424
|
0 |
0 |
if ( |
11425
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
11438
|
0 |
0 |
if ( _klen > 0 ) { |
11443
|
0 |
0 |
if ( _upper < _lower ) |
11447
|
0 |
0 |
if ( _widec < *_mid ) |
11449
|
0 |
0 |
else if ( _widec > *_mid ) |
11461
|
0 |
0 |
if ( _klen > 0 ) { |
11466
|
0 |
0 |
if ( _upper < _lower ) |
11470
|
0 |
0 |
if ( _widec < _mid[0] ) |
11472
|
0 |
0 |
else if ( _widec > _mid[1] ) |
11487
|
0 |
0 |
if ( _czech_tokenizer_trans_actions[_trans] == 0 ) |
11501
|
0 |
0 |
do |
11502
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11510
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
11513
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
11518
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
11520
|
0 |
0 |
do |
11521
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11530
|
0 |
0 |
do |
11531
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11539
|
0 |
0 |
do |
11540
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11547
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
11549
|
0 |
0 |
do |
11550
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11559
|
0 |
0 |
do |
11560
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
11568
|
0 |
0 |
switch ( _czech_tokenizer_to_state_actions[cs] ) { |
11574
|
0 |
0 |
if ( cs == 0 ) |
11576
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
11579
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
11581
|
0 |
0 |
if ( _czech_tokenizer_eof_trans[cs] > 0 ) { |
11641
|
0 |
0 |
return new czech_tokenizer(language, version, m); |
11648
|
0 |
0 |
return bool(is) && (language == czech_tokenizer::CZECH || language == czech_tokenizer::SLOVAK); |
|
0 |
0 |
return bool(is) && (language == czech_tokenizer::CZECH || language == czech_tokenizer::SLOVAK); |
11713
|
228 |
2 |
const unordered_set english_tokenizer::abbreviations = { |
|
0 |
0 |
const unordered_set english_tokenizer::abbreviations = { |
11812
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
|
0 |
0 |
if (tokens.empty() || chars[tokens.back().start].cat & ~unilib::unicode::L) return; |
11827
|
0 |
0 |
if ( ( index) == ( end) ) |
11836
|
0 |
0 |
if ( _klen > 0 ) { |
11841
|
0 |
0 |
if ( _upper < _lower ) |
11845
|
0 |
0 |
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < *_mid ) |
11847
|
0 |
0 |
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > *_mid ) |
11859
|
0 |
0 |
if ( _klen > 0 ) { |
11864
|
0 |
0 |
if ( _upper < _lower ) |
11868
|
0 |
0 |
if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) < _mid[0] ) |
11870
|
0 |
0 |
else if ( ( ragel_char(chars[tokens.back().start + end - index - 1])) > _mid[1] ) |
11884
|
0 |
0 |
if ( _english_tokenizer_split_token_trans_actions[_trans] == 0 ) |
11898
|
0 |
0 |
if ( cs == 0 ) |
11900
|
0 |
0 |
if ( ++( index) != ( end) ) |
11903
|
0 |
0 |
if ( ( index) == ( end) ) |
11905
|
0 |
0 |
switch ( _english_tokenizer_split_token_eof_actions[cs] ) { |
11915
|
0 |
0 |
if (split_len && split_len < end) { |
12069
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
english_tokenizer::english_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
12078
|
0 |
0 |
while (tokenize_url_email(tokens)) |
12079
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
12095
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
12100
|
0 |
0 |
switch ( _english_tokenizer_from_state_actions[cs] ) { |
12109
|
0 |
0 |
if ( _klen > 0 ) { |
12114
|
0 |
0 |
if ( _upper < _lower ) |
12118
|
0 |
0 |
if ( _widec < _mid[0] ) |
12120
|
0 |
0 |
else if ( _widec > _mid[1] ) |
12126
|
0 |
0 |
if ( |
12127
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
12132
|
0 |
0 |
if ( |
12133
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
12146
|
0 |
0 |
if ( _klen > 0 ) { |
12151
|
0 |
0 |
if ( _upper < _lower ) |
12155
|
0 |
0 |
if ( _widec < *_mid ) |
12157
|
0 |
0 |
else if ( _widec > *_mid ) |
12169
|
0 |
0 |
if ( _klen > 0 ) { |
12174
|
0 |
0 |
if ( _upper < _lower ) |
12178
|
0 |
0 |
if ( _widec < _mid[0] ) |
12180
|
0 |
0 |
else if ( _widec > _mid[1] ) |
12195
|
0 |
0 |
if ( _english_tokenizer_trans_actions[_trans] == 0 ) |
12209
|
0 |
0 |
do |
12210
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12218
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
12221
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
12226
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
12228
|
0 |
0 |
do |
12229
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12238
|
0 |
0 |
do |
12239
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12247
|
0 |
0 |
do |
12248
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12255
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
12257
|
0 |
0 |
do |
12258
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12267
|
0 |
0 |
do |
12268
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12276
|
0 |
0 |
switch ( _english_tokenizer_to_state_actions[cs] ) { |
12282
|
0 |
0 |
if ( cs == 0 ) |
12284
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
12287
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
12289
|
0 |
0 |
if ( _english_tokenizer_eof_trans[cs] > 0 ) { |
12446
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
|
0 |
0 |
generic_tokenizer::generic_tokenizer(unsigned version) : ragel_tokenizer(version <= 1 ? 1 : 2) {} |
12455
|
0 |
0 |
while (tokenize_url_email(tokens)) |
12456
|
0 |
0 |
if (emergency_sentence_split(tokens)) |
12472
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
12477
|
0 |
0 |
switch ( _generic_tokenizer_from_state_actions[cs] ) { |
12486
|
0 |
0 |
if ( _klen > 0 ) { |
12491
|
0 |
0 |
if ( _upper < _lower ) |
12495
|
0 |
0 |
if ( _widec < _mid[0] ) |
12497
|
0 |
0 |
else if ( _widec > _mid[1] ) |
12503
|
0 |
0 |
if ( |
12504
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
|
0 |
0 |
!current || (chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N | unicode::Pd)) ) _widec += 256; |
12509
|
0 |
0 |
if ( |
12510
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
|
0 |
0 |
!current || ((chars[current-1].cat & ~(unicode::L | unicode::M | unicode::N)) && chars[current-1].chr != '+') ) _widec += 256; |
12523
|
0 |
0 |
if ( _klen > 0 ) { |
12528
|
0 |
0 |
if ( _upper < _lower ) |
12532
|
0 |
0 |
if ( _widec < *_mid ) |
12534
|
0 |
0 |
else if ( _widec > *_mid ) |
12546
|
0 |
0 |
if ( _klen > 0 ) { |
12551
|
0 |
0 |
if ( _upper < _lower ) |
12555
|
0 |
0 |
if ( _widec < _mid[0] ) |
12557
|
0 |
0 |
else if ( _widec > _mid[1] ) |
12572
|
0 |
0 |
if ( _generic_tokenizer_trans_actions[_trans] == 0 ) |
12585
|
0 |
0 |
do |
12586
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12594
|
0 |
0 |
for (current = ts; current < whitespace; current++) |
12597
|
0 |
0 |
if (eos) {( current)++; goto _out; } |
12602
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
12604
|
0 |
0 |
do |
12605
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12613
|
0 |
0 |
do |
12614
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12622
|
0 |
0 |
do |
12623
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12630
|
0 |
0 |
if (!tokens.empty()) {( current)++; goto _out; } |
12632
|
0 |
0 |
do |
12633
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12641
|
0 |
0 |
do |
12642
|
0 |
0 |
if (emergency_sentence_split(tokens)) { ( current)--; {( current)++; goto _out; } } |
12650
|
0 |
0 |
switch ( _generic_tokenizer_to_state_actions[cs] ) { |
12656
|
0 |
0 |
if ( cs == 0 ) |
12658
|
0 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
12661
|
0 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
12663
|
0 |
0 |
if ( _generic_tokenizer_eof_trans[cs] > 0 ) { |
12726
|
0 |
0 |
version = is.get(); |
12771
|
0 |
0 |
os.put(version); |
12908
|
2 |
2 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
|
0 |
0 |
for (int i = 0; i < R; i++) fill_n(w[i], C, 0.f); |
12914
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
192 |
12 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
0 |
0 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
|
6 |
2 |
for (int i = 0; i < R; i++) memcpy(w[i], data.next(C), sizeof(float) * C); |
12930
|
1 |
0 |
if (chars.empty()) return; |
|
0 |
0 |
if (chars.empty()) return; |
|
0 |
0 |
if (chars.empty()) return; |
12934
|
34 |
1 |
for (size_t i = 0; i < chars.size(); i++) { |
|
0 |
0 |
for (size_t i = 0; i < chars.size(); i++) { |
|
0 |
0 |
for (size_t i = 0; i < chars.size(); i++) { |
12942
|
0 |
0 |
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
|
0 |
0 |
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
|
0 |
0 |
if (decomposition[0] == 0x3001) decomposition[0] = char32_t(','); |
12943
|
0 |
0 |
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
|
0 |
0 |
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
|
0 |
0 |
if (decomposition[0] == 0x3002) decomposition[0] = char32_t('.'); |
12944
|
0 |
0 |
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
|
0 |
0 |
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
|
0 |
0 |
if (decomposition[0] != chars[i].chr) embedding = embeddings.find(decomposition[0]); |
12947
|
34 |
0 |
if (embedding != embeddings.end()) { |
|
0 |
0 |
if (embedding != embeddings.end()) { |
|
0 |
0 |
if (embedding != embeddings.end()) { |
12952
|
0 |
0 |
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
|
0 |
0 |
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
|
0 |
0 |
outcomes[i].embedding = embedding != embeddings.end() ? embedding->second.cache.w[0] : empty_embedding.cache.w[0]; |
12957
|
34 |
1 |
for (auto&& outcome : outcomes) |
|
0 |
0 |
for (auto&& outcome : outcomes) |
|
0 |
0 |
for (auto&& outcome : outcomes) |
12958
|
102 |
34 |
for (int i = 0; i < 3; i++) |
|
0 |
0 |
for (int i = 0; i < 3; i++) |
|
0 |
0 |
for (int i = 0; i < 3; i++) |
12963
|
2 |
1 |
for (int dir = 0; dir < 2; dir++) { |
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
12964
|
1 |
1 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
12965
|
1 |
1 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
12968
|
68 |
2 |
for (size_t i = 0; i < outcomes.size(); i++) { |
|
0 |
0 |
for (size_t i = 0; i < outcomes.size(); i++) { |
|
0 |
0 |
for (size_t i = 0; i < outcomes.size(); i++) { |
12969
|
34 |
34 |
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
|
0 |
0 |
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
|
0 |
0 |
auto& outcome = outcomes[dir == 0 ? i : outcomes.size() - 1 - i]; |
12970
|
34 |
34 |
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
|
0 |
0 |
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
|
0 |
0 |
auto* embedding_cache = outcome.embedding + (dir == 1) * 3 * D; |
12972
|
68 |
1088 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
12975
|
17408 |
1088 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
12983
|
68 |
1088 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
12985
|
17408 |
1088 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
12991
|
204 |
68 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
12992
|
3264 |
204 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
12998
|
34 |
1 |
for (auto&& outcome : outcomes) { |
|
0 |
0 |
for (auto&& outcome : outcomes) { |
|
0 |
0 |
for (auto&& outcome : outcomes) { |
13000
|
1 |
33 |
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
|
0 |
0 |
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
|
0 |
0 |
if (outcome.w[2] > outcome.w[outcome.outcome]) outcome.outcome = 2; |
13008
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
0 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
1 |
0 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
|
20 |
1 |
for (unsigned chars = data.next_4B(); chars; chars--) { |
13009
|
0 |
0 |
auto& embedding = network->embeddings[data.next_4B()]; |
|
0 |
0 |
auto& embedding = network->embeddings[data.next_4B()]; |
|
20 |
0 |
auto& embedding = network->embeddings[data.next_4B()]; |
13010
|
0 |
0 |
copy_n(data.next(D), D, embedding.e.w[0]); |
|
0 |
0 |
copy_n(data.next(D), D, embedding.e.w[0]); |
|
20 |
0 |
copy_n(data.next(D), D, embedding.e.w[0]); |
13014
|
0 |
0 |
network->gru_fwd.load(data); |
|
0 |
0 |
network->gru_fwd.load(data); |
|
1 |
0 |
network->gru_fwd.load(data); |
13015
|
0 |
0 |
network->gru_bwd.load(data); |
|
0 |
0 |
network->gru_bwd.load(data); |
|
1 |
0 |
network->gru_bwd.load(data); |
13016
|
0 |
0 |
network->projection_fwd.load(data); |
|
0 |
0 |
network->projection_fwd.load(data); |
|
1 |
0 |
network->projection_fwd.load(data); |
13017
|
0 |
0 |
network->projection_bwd.load(data); |
|
0 |
0 |
network->projection_bwd.load(data); |
|
1 |
0 |
network->projection_bwd.load(data); |
13020
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
0 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
1 |
0 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
|
4 |
1 |
for (unsigned unknown_chars_len = data.next_1B(); unknown_chars_len; unknown_chars_len--) { |
13021
|
0 |
0 |
unilib::unicode::category_t cat = data.next_4B(); |
|
0 |
0 |
unilib::unicode::category_t cat = data.next_4B(); |
|
4 |
0 |
unilib::unicode::category_t cat = data.next_4B(); |
13022
|
0 |
0 |
network->unknown_chars[cat] = data.next_4B(); |
|
0 |
0 |
network->unknown_chars[cat] = data.next_4B(); |
|
4 |
0 |
network->unknown_chars[cat] = data.next_4B(); |
13032
|
0 |
0 |
for (auto&& embedding : embeddings) { |
|
0 |
0 |
for (auto&& embedding : embeddings) { |
|
20 |
1 |
for (auto&& embedding : embeddings) { |
13036
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
|
20 |
120 |
for (int i = 0; i < 6; i++) fill_n(cache.w[i], D, 0.f); |
13037
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[0][i] += e.w[0][j] * gru_fwd.X.w[i][j]; |
13038
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[1][i] += e.w[0][j] * gru_fwd.X_r.w[i][j]; |
13039
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[2][i] += e.w[0][j] * gru_fwd.X_z.w[i][j]; |
13040
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[3][i] += e.w[0][j] * gru_bwd.X.w[i][j]; |
13041
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[4][i] += e.w[0][j] * gru_bwd.X_r.w[i][j]; |
13042
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
0 |
0 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
320 |
20 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
|
5120 |
320 |
for (int i = 0; i < D; i++) for (int j = 0; j < D; j++) cache.w[5][i] += e.w[0][j] * gru_bwd.X_z.w[i][j]; |
13044
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
|
0 |
0 |
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
|
6 |
1 |
for (int i = 0; i < 6; i++) fill_n(empty_embedding.cache.w[i], D, 0.f); |
13067
|
0 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
0 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
0 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
|
1 |
0 |
: unicode_tokenizer(url_email_tokenizer), segment(segment), allow_spaces(allow_spaces), network_index(0), network_length(0), network(network) {} |
13102
|
29 |
5 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
29 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
29 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
0 |
29 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
4 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
4 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
4 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
0 |
4 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
28 |
5 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
28 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
28 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
0 |
28 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
7 |
5 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
7 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
7 |
0 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
|
0 |
7 |
return (chars[index].cat & unilib::unicode::Zs) || chars[index].chr == '\r' || chars[index].chr == '\n' || chars[index].chr == '\t'; |
13109
|
1 |
1 |
if (current == 0) network_index = network_length = 0; |
13112
|
8 |
1 |
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
|
0 |
8 |
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
|
8 |
1 |
for (bool eos = false; !eos && !emergency_sentence_split(tokens); ) { |
13113
|
12 |
1 |
while (current < chars.size() - 1 && is_space(current)) |
|
7 |
5 |
while (current < chars.size() - 1 && is_space(current)) |
|
5 |
8 |
while (current < chars.size() - 1 && is_space(current)) |
13114
|
0 |
5 |
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
0 |
0 |
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
5 |
0 |
if (next_outcome() == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
13117
|
7 |
1 |
if (current >= chars.size() - 1) break; |
13120
|
0 |
7 |
if (tokenize_url_email(tokens)) { |
13121
|
0 |
0 |
while (network_index < network_length && network_offsets[network_index] < current) |
|
0 |
0 |
while (network_index < network_length && network_offsets[network_index] < current) |
|
0 |
0 |
while (network_index < network_length && network_offsets[network_index] < current) |
13122
|
0 |
0 |
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
0 |
0 |
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
|
0 |
0 |
if (network_outcomes[network_index++].outcome == gru_tokenizer_network::END_OF_SENTENCE && !tokens.empty()) |
13129
|
22 |
0 |
do { |
13132
|
22 |
7 |
if (outcome != gru_tokenizer_network::NO_SPLIT) break; |
13141
|
1 |
33 |
if (network_index >= network_length) { |
13150
|
34 |
1 |
for (size_t offset = current; |
13151
|
34 |
1 |
network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment; |
|
0 |
34 |
network_offsets.push_back(offset), offset < chars.size() - 1 && network_length < segment; |
13153
|
5 |
29 |
if (is_space(offset)) { |
13155
|
4 |
1 |
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
|
4 |
0 |
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
|
0 |
5 |
while (offset + 1 < chars.size() - 1 && is_space(offset + 1)) offset++; |
13161
|
1 |
0 |
if (network_length < segment && network_chars.back().chr != ' ') |
|
1 |
0 |
if (network_length < segment && network_chars.back().chr != ' ') |
|
0 |
1 |
if (network_length < segment && network_chars.back().chr != ' ') |
13169
|
33 |
1 |
for (size_t i = 0; i < network_length - 1; i++) |
13170
|
28 |
5 |
if (is_space(network_offsets[i+1])) { |
13173
|
1 |
4 |
if (i + 2 == network_length) eos = true; |
13174
|
0 |
5 |
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
|
0 |
0 |
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
|
0 |
5 |
for (size_t j = network_offsets[i+1]; j + 1 < network_offsets[i+2] && !eos; j++) |
13175
|
0 |
0 |
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
|
0 |
0 |
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
|
0 |
0 |
eos = (chars[j].chr == '\n' && chars[j+1].chr == '\n') || |
13176
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
|
0 |
0 |
(j + 3 < network_offsets[i+2] && chars[j].chr == '\r' && chars[j+1].chr == '\n' && chars[j+2].chr == '\r' && chars[j+3].chr == '\n'); |
13177
|
1 |
4 |
if (eos) network_outcomes[i].outcome = gru_tokenizer_network::END_OF_SENTENCE; |
13179
|
1 |
4 |
if (network_outcomes[i].outcome == gru_tokenizer_network::NO_SPLIT) |
13181
|
0 |
4 |
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
|
0 |
0 |
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
|
0 |
4 |
if (!allow_spaces || network_outcomes[i+1].outcome == gru_tokenizer_network::END_OF_TOKEN) |
13186
|
0 |
1 |
if (network_length == segment && network_length >= 10) { |
|
0 |
0 |
if (network_length == segment && network_length >= 10) { |
13188
|
0 |
0 |
while (network_length > segment / 2) |
13189
|
0 |
0 |
if (network_outcomes[--network_length].outcome != gru_tokenizer_network::NO_SPLIT) |
13251
|
1 |
0 |
if (!is.get(version)) return false; |
13252
|
1 |
0 |
if (!(version >= 1 && version <= 2)) return false; |
13255
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
|
1 |
0 |
if (!compressor::load(is, data)) return false; |
13258
|
1 |
0 |
url_email_tokenizer = data.next_1B(); |
13259
|
1 |
0 |
segment = data.next_2B(); |
13260
|
0 |
1 |
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
|
0 |
0 |
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
|
0 |
0 |
allow_spaces = version >= 2 ? data.next_1B() : false /*false was default for version 1*/; |
13262
|
1 |
0 |
network.reset(gru_tokenizer_network::load(data)); |
13263
|
1 |
0 |
if (!network) return false; |
|
0 |
0 |
if (!network) return false; |
13289
|
1 |
0 |
if (data.next_1B() != 1) return nullptr; |
13352
|
0 |
0 |
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
|
0 |
0 |
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
|
0 |
0 |
class gru_tokenizer_network_trainer : public gru_tokenizer_network_implementation { |
13370
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
|
0 |
0 |
matrix_trainer(matrix& original) : original(original), w_g(), b_g(), w_m(), b_m(), w_v(), b_v() {} |
13380
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
|
0 |
0 |
updates(segment), resets(segment), resetstates(segment), candidates(segment), dropouts(segment) {} |
13405
|
0 |
0 |
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
|
0 |
0 |
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
|
0 |
0 |
if (segment < 10) return error.assign("Segment size must be at least 10!"), false; |
13408
|
0 |
0 |
for (auto&& sentence : data) |
|
0 |
0 |
for (auto&& sentence : data) |
|
0 |
0 |
for (auto&& sentence : data) |
13410
|
0 |
0 |
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
|
0 |
0 |
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
|
0 |
0 |
if (characters < segment) return error.assign("Not enought training data for the gru_tokenizer!"), false; |
13418
|
0 |
0 |
for (auto&& sentence : data) |
|
0 |
0 |
for (auto&& sentence : data) |
|
0 |
0 |
for (auto&& sentence : data) |
13419
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
13435
|
0 |
0 |
for (auto&& embedding : this->embeddings) |
|
0 |
0 |
for (auto&& embedding : this->embeddings) |
|
0 |
0 |
for (auto&& embedding : this->embeddings) |
13437
|
0 |
0 |
vector*> chosen_embeddings(segment); |
|
0 |
0 |
vector*> chosen_embeddings(segment); |
|
0 |
0 |
vector*> chosen_embeddings(segment); |
13438
|
0 |
0 |
vector> embedding_dropouts(segment); |
|
0 |
0 |
vector> embedding_dropouts(segment); |
|
0 |
0 |
vector> embedding_dropouts(segment); |
13439
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
|
0 |
0 |
gru_trainer gru_fwd(this->gru_fwd, segment), gru_bwd(this->gru_bwd, segment); |
13447
|
0 |
0 |
vector training_input, instance_input(segment); |
|
0 |
0 |
vector training_input, instance_input(segment); |
|
0 |
0 |
vector training_input, instance_input(segment); |
13448
|
0 |
0 |
vector training_output, instance_output(segment); |
|
0 |
0 |
vector training_output, instance_output(segment); |
|
0 |
0 |
vector training_output, instance_output(segment); |
13449
|
0 |
0 |
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
|
0 |
0 |
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
|
0 |
0 |
vector permutation; for (size_t i = 0; i < data.size(); i++) permutation.push_back(permutation.size()); |
13450
|
0 |
0 |
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
|
0 |
0 |
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
|
0 |
0 |
for (unsigned epoch = 0; epoch < epochs; epoch++) { |
13454
|
0 |
0 |
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
|
0 |
0 |
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
|
0 |
0 |
for (int instance = 0, instances = 10000; instance < instances; instance++) { |
13456
|
0 |
0 |
if (training_offset + segment >= training_input.size()) { |
|
0 |
0 |
if (training_offset + segment >= training_input.size()) { |
|
0 |
0 |
if (training_offset + segment >= training_input.size()) { |
13459
|
0 |
0 |
for (auto&& index : permutation) { |
|
0 |
0 |
for (auto&& index : permutation) { |
|
0 |
0 |
for (auto&& index : permutation) { |
13461
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
13464
|
0 |
0 |
training_input.resize(training_offset + sentence.sentence.size()); |
|
0 |
0 |
training_input.resize(training_offset + sentence.sentence.size()); |
|
0 |
0 |
training_input.resize(training_offset + sentence.sentence.size()); |
13465
|
0 |
0 |
training_output.resize(training_offset + sentence.sentence.size()); |
|
0 |
0 |
training_output.resize(training_offset + sentence.sentence.size()); |
|
0 |
0 |
training_output.resize(training_offset + sentence.sentence.size()); |
13466
|
0 |
0 |
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
|
0 |
0 |
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
|
0 |
0 |
for (size_t i = 0; i < sentence.sentence.size(); i++) { |
13470
|
0 |
0 |
for (size_t i = 0; i < sentence.tokens.size(); i++) |
|
0 |
0 |
for (size_t i = 0; i < sentence.tokens.size(); i++) |
|
0 |
0 |
for (size_t i = 0; i < sentence.tokens.size(); i++) |
13471
|
0 |
0 |
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
|
0 |
0 |
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
|
0 |
0 |
training_output[training_offset + sentence.tokens[i].start + sentence.tokens[i].length - 1].outcome = |
13480
|
0 |
0 |
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
|
0 |
0 |
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
|
0 |
0 |
for (training_shift = segment - 5; training_shift > segment / 2; training_shift--) |
13481
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
|
0 |
0 |
if (instance_output[training_shift-1].outcome != gru_tokenizer_network::NO_SPLIT || instance_input[training_shift-1].chr == ' ') |
13486
|
0 |
0 |
for (unsigned i = 0; i < segment; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < segment; i++) { |
|
0 |
0 |
for (unsigned i = 0; i < segment; i++) { |
13488
|
0 |
0 |
for (unsigned k = 0; k < D; k++) |
|
0 |
0 |
for (unsigned k = 0; k < D; k++) |
|
0 |
0 |
for (unsigned k = 0; k < D; k++) |
13489
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
|
0 |
0 |
embedding_dropouts[i].w[0][k] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier; |
13490
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
13494
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
13495
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
13496
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
13499
|
0 |
0 |
for (size_t i = 0; i < segment; i++) { |
|
0 |
0 |
for (size_t i = 0; i < segment; i++) { |
|
0 |
0 |
for (size_t i = 0; i < segment; i++) { |
13500
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
13501
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
13502
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
13504
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
13507
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
13515
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
13517
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
13523
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
0 |
0 |
for (int j = 0; j < D; j++) |
13524
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
|
0 |
0 |
gru.dropouts[i].w[0][j] = dropout && dropout_distribution(generator) ? 0.f : dropout_multiplier * gru.states[i+1].w[0][j]; |
13526
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
13527
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
|
0 |
0 |
for (int k = 0; k < D; k++) |
13532
|
0 |
0 |
for (auto&& output : instance_output) { |
|
0 |
0 |
for (auto&& output : instance_output) { |
|
0 |
0 |
for (auto&& output : instance_output) { |
13534
|
0 |
0 |
if (output.w[2] > output.w[best]) best = 2; |
|
0 |
0 |
if (output.w[2] > output.w[best]) best = 2; |
|
0 |
0 |
if (output.w[2] > output.w[best]) best = 2; |
13536
|
0 |
0 |
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
|
0 |
0 |
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
|
0 |
0 |
for (int j = 0; j < 3; j++) sum += (output.w[j] = exp(output.w[j] - maximum)); |
13538
|
0 |
0 |
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
|
0 |
0 |
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
|
0 |
0 |
for (int j = 0; j < 3; j++) output.w[j] *= sum; |
13546
|
0 |
0 |
for (auto&& output : instance_output) |
|
0 |
0 |
for (auto&& output : instance_output) |
|
0 |
0 |
for (auto&& output : instance_output) |
13547
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
|
0 |
0 |
for (int j = 0; j < 3; j++) |
13548
|
0 |
0 |
output.w[j] = (output.outcome == j) - output.w[j]; |
|
0 |
0 |
output.w[j] = (output.outcome == j) - output.w[j]; |
|
0 |
0 |
output.w[j] = (output.outcome == j) - output.w[j]; |
13550
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
|
0 |
0 |
for (int dir = 0; dir < 2; dir++) { |
13551
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
|
0 |
0 |
auto& gru = dir == 0 ? gru_fwd : gru_bwd; |
13552
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
|
0 |
0 |
auto& projection = dir == 0 ? projection_fwd : projection_bwd; |
13556
|
0 |
0 |
for (size_t i = segment; i--; ) { |
|
0 |
0 |
for (size_t i = segment; i--; ) { |
|
0 |
0 |
for (size_t i = segment; i--; ) { |
13557
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding = chosen_embeddings[dir == 0 ? i : segment - 1 - i]; |
13558
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& embedding_dropout = embedding_dropouts[dir == 0 ? i : segment - 1 - i]; |
13559
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
|
0 |
0 |
auto& output = instance_output[dir == 0 ? i : segment - 1 - i]; |
13561
|
0 |
0 |
for (int j = 0; j < D; j++) // These for cycles are swapped because |
|
0 |
0 |
for (int j = 0; j < D; j++) // These for cycles are swapped because |
|
0 |
0 |
for (int j = 0; j < D; j++) // These for cycles are swapped because |
13562
|
0 |
0 |
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
|
0 |
0 |
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
|
0 |
0 |
for (int k = 0; k < 3; k++) // g++-4.8 generates wrong code otherwise. |
13565
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
0 |
0 |
for (int j = 0; j < D; j++) |
|
0 |
0 |
for (int j = 0; j < D; j++) |
13566
|
0 |
0 |
if (gru.dropouts[i].w[0][j]) |
|
0 |
0 |
if (gru.dropouts[i].w[0][j]) |
|
0 |
0 |
if (gru.dropouts[i].w[0][j]) |
13567
|
0 |
0 |
for (int k = 0; k < 3; k++) |
|
0 |
0 |
for (int k = 0; k < 3; k++) |
|
0 |
0 |
for (int k = 0; k < 3; k++) |
13571
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
13578
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
13585
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
|
0 |
0 |
for (int j = 0; j < D; j++) { |
13594
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
|
0 |
0 |
for (int k = 0; k < D; k++) { |
13608
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
|
0 |
0 |
if (batch_size == 1 || |
13615
|
0 |
0 |
if (batch_size == 1) |
|
0 |
0 |
if (batch_size == 1) |
|
0 |
0 |
if (batch_size == 1) |
13616
|
0 |
0 |
for (auto&& chosen_embedding : chosen_embeddings) |
|
0 |
0 |
for (auto&& chosen_embedding : chosen_embeddings) |
|
0 |
0 |
for (auto&& chosen_embedding : chosen_embeddings) |
13619
|
0 |
0 |
for (auto&& embedding : embeddings) |
|
0 |
0 |
for (auto&& embedding : embeddings) |
|
0 |
0 |
for (auto&& embedding : embeddings) |
13627
|
0 |
0 |
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
|
0 |
0 |
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
|
0 |
0 |
if (learning_rate_final && learning_rate_final != learning_rate_initial) |
13631
|
0 |
0 |
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
|
0 |
0 |
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
|
0 |
0 |
cerr << "Epoch " << epoch+1 << ", logprob: " << scientific << setprecision(4) << logprob |
13633
|
0 |
0 |
if (!heldout.empty()) { |
|
0 |
0 |
if (!heldout.empty()) { |
|
0 |
0 |
if (!heldout.empty()) { |
13635
|
0 |
0 |
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
|
0 |
0 |
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
|
0 |
0 |
evaluate(url_email_tokenizer, segment, allow_spaces, heldout, tokens, sentences); |
13636
|
0 |
0 |
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
|
0 |
0 |
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
|
0 |
0 |
cerr << ", heldout tokens: " << 100. * tokens.precision << "%P/" << 100. * tokens.recall << "%R/" |
13640
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
|
0 |
0 |
if (early_stopping && sentences.f1 + tokens.f1 > best_combined_f1) { |
13645
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
|
0 |
0 |
if (early_stopping && best_combined_f1 && epoch - best_combined_f1_epoch > 30) { |
13654
|
0 |
0 |
if (early_stopping && best_combined_f1) { |
|
0 |
0 |
if (early_stopping && best_combined_f1) { |
|
0 |
0 |
if (early_stopping && best_combined_f1) { |
13664
|
0 |
0 |
enc.add_1B(1); |
|
0 |
0 |
enc.add_1B(1); |
|
0 |
0 |
enc.add_1B(1); |
13665
|
0 |
0 |
enc.add_1B(D); |
|
0 |
0 |
enc.add_1B(D); |
|
0 |
0 |
enc.add_1B(D); |
13668
|
0 |
0 |
for (auto&& embedding : this->embeddings) { |
|
0 |
0 |
for (auto&& embedding : this->embeddings) { |
|
0 |
0 |
for (auto&& embedding : this->embeddings) { |
13672
|
0 |
0 |
save_gru(this->gru_fwd, enc); |
|
0 |
0 |
save_gru(this->gru_fwd, enc); |
|
0 |
0 |
save_gru(this->gru_fwd, enc); |
13673
|
0 |
0 |
save_gru(this->gru_bwd, enc); |
|
0 |
0 |
save_gru(this->gru_bwd, enc); |
|
0 |
0 |
save_gru(this->gru_bwd, enc); |
13674
|
0 |
0 |
save_matrix(this->projection_fwd, enc); |
|
0 |
0 |
save_matrix(this->projection_fwd, enc); |
|
0 |
0 |
save_matrix(this->projection_fwd, enc); |
13675
|
0 |
0 |
save_matrix(this->projection_bwd, enc); |
|
0 |
0 |
save_matrix(this->projection_bwd, enc); |
|
0 |
0 |
save_matrix(this->projection_bwd, enc); |
13682
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
13683
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
|
0 |
0 |
for (int j = 0; j < C; j++) { |
13693
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
13694
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
13716
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
0 |
0 |
for (auto&& sentence : heldout) { |
|
0 |
0 |
for (auto&& sentence : heldout) { |
13717
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
|
0 |
0 |
if (sentence.tokens.empty()) continue; |
13719
|
0 |
0 |
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
|
0 |
0 |
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
|
0 |
0 |
gold_sentences.emplace_back(text.size() + sentence.tokens.front().start, sentence.tokens.back().start + sentence.tokens.back().length - sentence.tokens.front().start); |
13720
|
0 |
0 |
for (auto&& token : sentence.tokens) |
|
0 |
0 |
for (auto&& token : sentence.tokens) |
|
0 |
0 |
for (auto&& token : sentence.tokens) |
13721
|
0 |
0 |
gold_tokens.emplace_back(text.size() + token.start, token.length); |
|
0 |
0 |
gold_tokens.emplace_back(text.size() + token.start, token.length); |
|
0 |
0 |
gold_tokens.emplace_back(text.size() + token.start, token.length); |
13731
|
0 |
0 |
unilib::utf8::encode(text, text_utf8); |
|
0 |
0 |
unilib::utf8::encode(text, text_utf8); |
|
0 |
0 |
unilib::utf8::encode(text, text_utf8); |
13732
|
0 |
0 |
tokenizer.set_text(text_utf8); |
|
0 |
0 |
tokenizer.set_text(text_utf8); |
|
0 |
0 |
tokenizer.set_text(text_utf8); |
13734
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
|
0 |
0 |
while (tokenizer.next_sentence(tokens)) |
13735
|
0 |
0 |
if (!tokens.empty()) { |
|
0 |
0 |
if (!tokens.empty()) { |
|
0 |
0 |
if (!tokens.empty()) { |
13736
|
0 |
0 |
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
|
0 |
0 |
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
|
0 |
0 |
system_sentences.emplace_back(tokens.front().start, tokens.back().start + tokens.back().length - tokens.front().start); |
13737
|
0 |
0 |
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
|
0 |
0 |
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
|
0 |
0 |
system_tokens.insert(system_tokens.end(), tokens.begin(), tokens.end()); |
13747
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
|
0 |
0 |
for (size_t si = 0, gi = 0; si < system.size() || gi < gold.size(); ) |
13748
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
|
0 |
0 |
if (si < system.size() && (gi == gold.size() || system[si].start < gold[gi].start)) |
13750
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
|
0 |
0 |
else if (gi < gold.size() && (si == system.size() || gold[gi].start < system[si].start)) |
13755
|
0 |
0 |
f1.precision = system.size() ? both / double(system.size()) : 0.; |
|
0 |
0 |
f1.precision = system.size() ? both / double(system.size()) : 0.; |
|
0 |
0 |
f1.precision = system.size() ? both / double(system.size()) : 0.; |
13756
|
0 |
0 |
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
|
0 |
0 |
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
|
0 |
0 |
f1.recall = gold.size() ? both / double(gold.size()) : 0.; |
13757
|
0 |
0 |
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
|
0 |
0 |
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
|
0 |
0 |
f1.f1 = system.size()+gold.size() ? 2 * both / double(system.size() + gold.size()) : 0.; |
13763
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
|
0 |
0 |
for (int i = 0; i < R; i++) { |
13765
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
|
0 |
0 |
for (int j = 0; j < C; j++) |
13782
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
0 |
0 |
for (int i = 0; i < R; i++) |
|
0 |
0 |
for (int i = 0; i < R; i++) |
13826
|
0 |
0 |
enc.add_1B(url_email_tokenizer); |
13827
|
0 |
0 |
enc.add_2B(segment); |
13828
|
0 |
0 |
enc.add_1B(allow_spaces); |
13831
|
0 |
0 |
if (dimension == 16) { |
13833
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
13835
|
0 |
0 |
} else if (dimension == 24) { |
13837
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
13839
|
0 |
0 |
} else if (dimension == 64) { |
13841
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
|
0 |
0 |
if (!network.train(url_email_tokenizer, segment, allow_spaces, epochs, batch_size, learning_rate, learning_rate_final, |
13844
|
0 |
0 |
return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false; |
|
0 |
0 |
return error.assign("Gru tokenizer dimension '").append(to_string(dimension)).append("' is not supported!"), false; |
13849
|
0 |
0 |
for (auto&& sentence : data) |
13850
|
0 |
0 |
for (auto&& chr : sentence.sentence) |
13854
|
0 |
0 |
for (auto&& count : counts) { |
13857
|
0 |
0 |
for (auto&& chr : count.second) |
13858
|
0 |
0 |
if (chr.second > best) |
13860
|
0 |
0 |
if (best_chr) |
13863
|
0 |
0 |
enc.add_1B(unknown_chars.size()); |
13864
|
0 |
0 |
for (auto&& unknown_char : unknown_chars) { |
13869
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot save gru_tokenizer_factory!"), false; |
14241
|
0 |
0 |
initialize_ragel_map(); |
14245
|
0 |
1 |
while (ragel_map_flag.test_and_set()) {} |
14246
|
1 |
0 |
if (ragel_map.empty()) { |
14247
|
128 |
1 |
for (uint8_t ascii = 0; ascii < 128; ascii++) |
14259
|
1 |
3 |
if (chr >= ragel_map.size()) |
14279
|
7 |
0 |
if ( ( current) == ( (chars.size() - 1)) ) |
14287
|
0 |
30 |
if ( _klen > 0 ) { |
14292
|
0 |
0 |
if ( _upper < _lower ) |
14296
|
0 |
0 |
if ( _widec < _mid[0] ) |
14298
|
0 |
0 |
else if ( _widec > _mid[1] ) |
14304
|
0 |
0 |
if ( |
14310
|
0 |
0 |
if ( |
14324
|
30 |
0 |
if ( _klen > 0 ) { |
14329
|
87 |
30 |
if ( _upper < _lower ) |
14333
|
13 |
74 |
if ( _widec < *_mid ) |
14335
|
74 |
0 |
else if ( _widec > *_mid ) |
14347
|
30 |
0 |
if ( _klen > 0 ) { |
14352
|
86 |
7 |
if ( _upper < _lower ) |
14356
|
9 |
77 |
if ( _widec < _mid[0] ) |
14358
|
54 |
23 |
else if ( _widec > _mid[1] ) |
14372
|
0 |
30 |
if ( _ragel_url_email_trans_actions[_trans] == 0 ) |
14393
|
23 |
7 |
if ( cs == 0 ) |
14395
|
23 |
0 |
if ( ++( current) != ( (chars.size() - 1)) ) |
14401
|
0 |
7 |
if (end > start) { |
14430
|
0 |
0 |
vertical_tokenizer() : unicode_tokenizer(0) {} |
14528
|
0 |
0 |
if (res->load(is)) return res.release(); |
14534
|
1 |
0 |
if (res->load(is)) return res.release(); |
|
1 |
0 |
if (res->load(is)) return res.release(); |
14540
|
0 |
0 |
if (res->load(is)) return res.release(); |
|
0 |
0 |
if (res->load(is)) return res.release(); |
14551
|
0 |
0 |
ifstream f(path_from_utf8(fname).c_str(), ifstream::binary); |
14552
|
0 |
0 |
if (!f) return nullptr; |
14554
|
0 |
0 |
return load(f); |
14575
|
1 |
0 |
ragel_tokenizer::initialize_ragel_map(); |
14577
|
1 |
0 |
set_text(string_piece(nullptr, 0)); |
14583
|
0 |
2 |
if (make_copy && text.str) { |
|
0 |
0 |
if (make_copy && text.str) { |
14590
|
34 |
2 |
for (const char* curr_str = text.str; text.len; curr_str = text.str) |
14596
|
2 |
0 |
vector& tokens = tokens_ptr ? *tokens_ptr : tokens_buffer; |
14598
|
2 |
0 |
if (forms) forms->clear(); |
14599
|
2 |
0 |
if (current >= chars.size() - 1) return false; |
14602
|
2 |
0 |
if (forms) |
14603
|
7 |
2 |
for (auto&& token : tokens) |
14610
|
7 |
0 |
if (current >= chars.size() - 1) return false; |
14612
|
7 |
0 |
return url_email_tokenizer ? ragel_tokenizer::ragel_url_email(url_email_tokenizer, chars, current, tokens) : false; |
14619
|
0 |
8 |
return tokens.size() >= 500 || |
14620
|
8 |
0 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
0 |
0 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
|
0 |
8 |
(tokens.size() >= 450 && chars[tokens.back().start].cat & unicode::P) || |
14621
|
0 |
0 |
(tokens.size() >= 400 && chars[tokens.back().start].cat & unicode::Po); |
14627
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
|
0 |
0 |
if (eos_chr == '.' && !tokens.empty()) { |
14629
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
|
0 |
0 |
if (tokens.back().length == 1 && chars[tokens.back().start].cat & unicode::Lut) |
14633
|
0 |
0 |
if (abbreviations) { |
14635
|
0 |
0 |
for (size_t i = 0; i < tokens.back().length; i++) |
14637
|
0 |
0 |
if (abbreviations->count(eos_buffer)) |
14662
|
0 |
0 |
if (current >= chars.size() - 1) return false; |
14666
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
|
0 |
0 |
while (current < chars.size() - 1 && chars[current].chr != '\r' && chars[current].chr != '\n') current++; |
14669
|
0 |
0 |
if (current < chars.size() - 1) { |
14671
|
0 |
0 |
if (current < chars.size() - 1 && |
|
0 |
0 |
if (current < chars.size() - 1 && |
|
0 |
0 |
if (current < chars.size() - 1 && |
14672
|
0 |
0 |
((chars[current-1].chr == '\r' && chars[current].chr == '\n') || |
|
0 |
0 |
((chars[current-1].chr == '\r' && chars[current].chr == '\n') || |
14673
|
0 |
0 |
(chars[current-1].chr == '\n' && chars[current].chr == '\r'))) |
14677
|
0 |
0 |
if (line_start < line_end) |
14765
|
0 |
0 |
return {1, 11, 1, "dev"}; |
|
0 |
0 |
return {1, 11, 1, "dev"}; |
14776
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
14778
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
14780
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
14803
|
0 |
1 |
assert(t); |
14809
|
1 |
0 |
if (!t->nodes.empty()) stack.push_back(0); |
14813
|
7 |
1 |
for (size_t i = t->nodes.size(); i > 1; i--) |
14818
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
16 |
51 |
return buffer.empty() && stack.size() <= 1; |
|
11 |
5 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
|
0 |
0 |
return buffer.empty() && stack.size() <= 1; |
14881
|
1116 |
62 |
for (auto&& selector : selectors) { |
14886
|
867 |
63 |
if (selector.start.second < int(conf.stack.size())) |
14890
|
98 |
88 |
if (selector.start.second < int(conf.buffer.size())) |
14896
|
965 |
151 |
if (current >= 0) |
14897
|
802 |
410 |
for (auto&& direction : selector.directions) { |
14901
|
0 |
0 |
current = node.head ? node.head : -1; |
14904
|
120 |
281 |
current = direction.second >= 0 && direction.second < int(node.children.size()) ? |
14906
|
127 |
274 |
direction.second < 0 && -direction.second <= int(node.children.size()) ? |
14908
|
401 |
401 |
-1; |
|
401 |
281 |
-1; |
14911
|
247 |
555 |
if (current <= 0) break; |
14924
|
1 |
0 |
split(description, '\n', lines); |
14925
|
19 |
1 |
for (auto&& line : lines) { |
14926
|
18 |
1 |
if (!line.len || line.str[0] == '#') continue; |
|
18 |
0 |
if (!line.len || line.str[0] == '#') continue; |
14929
|
18 |
0 |
split(line, ',', parts); |
14932
|
18 |
0 |
split(parts[0], ' ', words); |
14933
|
0 |
18 |
if (words.size() != 2) |
14934
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[0].str, parts[0].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
14937
|
15 |
3 |
if (words[0] == "stack") |
14939
|
3 |
0 |
else if (words[0] == "buffer") |
14942
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse starting location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
14945
|
18 |
0 |
if (!parse_int(words[1], "starting index", start_index, error)) return false; |
|
18 |
0 |
if (!parse_int(words[1], "starting index", start_index, error)) return false; |
14947
|
18 |
0 |
selectors.emplace_back(start, start_index); |
14950
|
16 |
18 |
for (size_t i = 1; i < parts.size(); i++) { |
14951
|
16 |
0 |
split(parts[i], ' ', words); |
14952
|
0 |
16 |
if (words.empty()) |
14953
|
0 |
0 |
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Empty node selector on line '").append(line.str, line.len).append(".!"), false; |
14955
|
0 |
16 |
if (words[0] == "parent") { |
14956
|
0 |
0 |
if (words.size() != 1) |
14957
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain one space separated value!"), false; |
14958
|
0 |
0 |
selectors.back().directions.emplace_back(PARENT, 0); |
14959
|
16 |
0 |
} else if (words[0] == "child") { |
14960
|
0 |
16 |
if (words.size() != 2) |
14961
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
|
0 |
0 |
return error.assign("The node selector '").append(parts[i].str, parts[i].len).append("' on line '").append(line.str, line.len).append("' does not contain two space separated values!"), false; |
14963
|
16 |
0 |
if (!parse_int(words[1], "child index", child_index, error)) return false; |
|
16 |
0 |
if (!parse_int(words[1], "child index", child_index, error)) return false; |
14964
|
16 |
0 |
selectors.back().directions.emplace_back(CHILD, child_index); |
14966
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
|
0 |
0 |
return error.assign("Cannot parse direction location '").append(words[0].str, words[0].len).append("' on line '").append(line.str, line.len).append(".!"), false; |
15029
|
0 |
0 |
if (!n.misc.empty()) { |
15032
|
0 |
0 |
if (lid != string::npos) { |
15037
|
0 |
0 |
if (lid_end == string::npos) lid_end = n.misc.size(); |
15067
|
1 |
3 |
if (description == "form") |
15069
|
0 |
3 |
else if (description == "lemma") |
15071
|
0 |
3 |
else if (description == "lemma_id") |
15073
|
0 |
3 |
else if (description == "tag") |
15075
|
1 |
2 |
else if (description == "universal_tag") |
15077
|
1 |
1 |
else if (description == "feats") |
15079
|
0 |
1 |
else if (description == "universal_tag_feats") |
15081
|
1 |
0 |
else if (description == "deprel") |
15150
|
92 |
36 |
if (it != dictionary.end()) return it->second; |
15157
|
18 |
36 |
for (auto&& chr : utf8::decoder(word)) { |
15158
|
3 |
15 |
(first ? first_category : other_categories) |= unicode::category(chr); |
15162
|
0 |
36 |
if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) { |
|
0 |
0 |
if ((first_category & unicode::Lut) && (other_categories & unicode::Lut)) { |
15166
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) { |
15167
|
0 |
0 |
utf8::append(buffer, first ? chr : unicode::lowercase(chr)); |
15172
|
0 |
0 |
if (it != dictionary.end()) return it->second; |
15175
|
36 |
0 |
if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) { |
|
0 |
36 |
if ((first_category & unicode::Lut) || (other_categories & unicode::Lut)) { |
15179
|
0 |
0 |
if (it != dictionary.end()) return it->second; |
15184
|
0 |
36 |
if ((first_category & unicode::N) && !(other_categories & unicode::L)) { |
|
0 |
0 |
if ((first_category & unicode::N) && !(other_categories & unicode::L)) { |
15189
|
0 |
0 |
if (it != dictionary.end()) return it->second; |
15200
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
15205
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
31 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
4 |
27 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
27 |
4 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
27 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
27 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
27 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
|
0 |
0 |
if (id < 0 || id * dimension >= weights.size()) return nullptr; |
15218
|
4 |
0 |
for (unsigned size = data.next_4B(); size; size--) { |
|
23 |
4 |
for (unsigned size = data.next_4B(); size; size--) { |
15219
|
23 |
0 |
data.next_str(word); |
15223
|
4 |
0 |
unknown_index = data.next_1B() ? dictionary.size() : -1; |
|
4 |
0 |
unknown_index = data.next_1B() ? dictionary.size() : -1; |
15226
|
4 |
0 |
weights.resize(dimension * (dictionary.size() + (unknown_index >= 0))); |
15227
|
4 |
0 |
memcpy(weights.data(), data.next(weights.size()), sizeof(float) * weights.size()); |
15253
|
0 |
0 |
for (auto&& entry : dictionary) { |
15254
|
0 |
0 |
assert(entry.second >= 0 && entry.second < int(dictionary.size())); |
|
0 |
0 |
assert(entry.second >= 0 && entry.second < int(dictionary.size())); |
15258
|
0 |
0 |
for (auto&& word : words) |
15259
|
0 |
0 |
enc.add_str(word); |
15261
|
0 |
0 |
enc.add_1B(unknown_index >= 0); |
15277
|
0 |
0 |
for (auto&& word : words) { |
15278
|
0 |
0 |
assert(word.second.size() == dimension); |
15283
|
0 |
0 |
if (unknown_weights.empty()) { |
15295
|
0 |
0 |
if (dictionary.empty()) return; |
15297
|
0 |
0 |
assert(unknown_index < 0 || unknown_index == int(dictionary.size())); |
|
0 |
0 |
assert(unknown_index < 0 || unknown_index == int(dictionary.size())); |
15300
|
0 |
0 |
for (auto&& entry : dictionary) { |
15304
|
0 |
0 |
if (unknown_index >= 0) |
15353
|
0 |
0 |
class neural_network { |
|
0 |
0 |
class neural_network { |
|
2 |
1 |
class neural_network { |
|
1 |
0 |
class neural_network { |
|
2 |
1 |
class neural_network { |
15397
|
367 |
2 |
for (auto&& row : m) { |
15411
|
0 |
62 |
assert(!weights[0].empty()); |
15412
|
0 |
62 |
assert(!weights[1].empty()); |
15413
|
1116 |
62 |
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
|
410 |
706 |
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
|
0 |
410 |
for (auto&& embedding_ids : embedding_ids_sequences) if (embedding_ids) assert(embeddings.size() == embedding_ids->size()); |
15424
|
1116 |
62 |
for (unsigned sequence = 0; sequence < embedding_ids_sequences.size(); sequence++) |
15425
|
4464 |
1116 |
for (unsigned i = 0; i < embeddings.size(); index += embeddings[i].dimension, i++) |
15426
|
1640 |
2824 |
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
|
0 |
1640 |
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
|
1640 |
2824 |
if (embedding_ids_sequences[sequence] && embedding_ids_sequences[sequence]->at(i) >= 0) { |
15428
|
1640 |
0 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
1640 |
0 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
0 |
1640 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
|
1640 |
0 |
if (cache && i < cache->size() && word < cache->at(i).size()) { |
15431
|
8200 |
1640 |
for (unsigned j = 0; j < hidden_layer_size; j++) |
15436
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
15437
|
0 |
0 |
for (unsigned k = 0; k < hidden_layer_size; k++) |
15441
|
310 |
62 |
for (unsigned i = 0; i < hidden_layer_size; i++) // Bias |
15447
|
62 |
0 |
if (!tanh_cache.empty()) |
15448
|
310 |
62 |
for (auto&& weight : hidden_layer) |
15449
|
310 |
0 |
weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)]; |
|
310 |
0 |
weight = weight <= -10 ? -1 : weight >= 10 ? 1 : tanh_cache[int(weight * 32768 + 10 * 32768)]; |
15451
|
0 |
0 |
for (auto&& weight : hidden_layer) |
15455
|
0 |
0 |
for (auto&& weight : hidden_layer) |
15459
|
0 |
0 |
for (auto&& weight : hidden_layer) |
15460
|
0 |
0 |
if (weight < 0) weight = 0; |
15464
|
310 |
62 |
for (unsigned i = 0; i < hidden_layer_size; i++) |
15465
|
4030 |
310 |
for (unsigned j = 0; j < outcomes_size; j++) |
15467
|
806 |
62 |
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
15471
|
62 |
0 |
if (softmax) { |
15473
|
62 |
744 |
for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i]; |
|
68 |
676 |
for (unsigned i = 1; i < outcomes_size; i++) if (outcomes[i] > max) max = outcomes[i]; |
15476
|
806 |
62 |
for (unsigned i = 0; i < outcomes_size; i++) sum += (outcomes[i] = exp(outcomes[i] - max)); |
15479
|
806 |
62 |
for (unsigned i = 0; i < outcomes_size; i++) outcomes[i] *= sum; |
15485
|
655360 |
1 |
for (unsigned i = 0; i < tanh_cache.size(); i++) |
15491
|
4 |
1 |
for (auto&& embedding : embeddings) embeddings_dim += embedding.dimension; |
15494
|
0 |
1 |
assert(sequences * embeddings_dim + 1 == weights[0].size()); |
15499
|
4 |
1 |
for (unsigned i = 0, weight_index = 0; i < embeddings.size(); weight_index += embeddings[i].dimension, i++) { |
15501
|
31 |
0 |
while (words < max_words && embeddings[i].weight(words)) words++; |
|
4 |
27 |
while (words < max_words && embeddings[i].weight(words)) words++; |
|
27 |
4 |
while (words < max_words && embeddings[i].weight(words)) words++; |
15504
|
27 |
4 |
for (unsigned word = 0; word < words; word++) { |
15508
|
486 |
27 |
for (unsigned sequence = 0, index = weight_index; sequence < sequences; index += embeddings_dim, sequence++) |
15509
|
2430 |
486 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
15510
|
12150 |
2430 |
for (unsigned k = 0; k < hidden_layer_size; k++) |
15587
|
0 |
0 |
struct workspace { |
|
0 |
0 |
struct workspace { |
|
0 |
0 |
struct workspace { |
|
0 |
0 |
struct workspace { |
|
0 |
0 |
struct workspace { |
|
0 |
0 |
struct workspace { |
15677
|
0 |
0 |
if (parameters.hidden_layer) { |
15679
|
0 |
0 |
-parameters.initialization_range * sqrt(6.0 / float(input_size + parameters.hidden_layer)); |
15683
|
0 |
0 |
for (auto&& row : network.weights[0]) { |
15685
|
0 |
0 |
for (auto&& weight : row) |
15690
|
0 |
0 |
-parameters.initialization_range * sqrt(6.0 / float(output_size + parameters.hidden_layer)); |
15694
|
0 |
0 |
for (auto&& row : network.weights[1]) { |
15696
|
0 |
0 |
for (auto&& weight : row) |
15713
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
15717
|
0 |
0 |
if (iteration++ >= iterations) return false; |
15719
|
0 |
0 |
if (trainer.algorithm != network_trainer::ADADELTA) |
15720
|
0 |
0 |
if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1) |
|
0 |
0 |
if (trainer.learning_rate != trainer.learning_rate_final && iteration > 1) |
15729
|
0 |
0 |
if (dropout_input) { |
15732
|
0 |
0 |
for (auto&& flag : w.input_dropout) |
15736
|
0 |
0 |
if (dropout_hidden) { |
15739
|
0 |
0 |
for (auto&& flag : w.hidden_dropout) |
15743
|
0 |
0 |
for (unsigned i = 0; i < network.weights[0].front().size(); i++) |
15744
|
0 |
0 |
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
|
0 |
0 |
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
|
0 |
0 |
if (w.hidden_dropout.empty() || !w.hidden_dropout[i]) |
15757
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
15760
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
15761
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
15763
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, embedding++, index++) |
15764
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) |
15765
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
15771
|
0 |
0 |
if (dropout_input) { // Dropout normalization |
15773
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15776
|
0 |
0 |
for (auto&& i : w.hidden_kept) // Bias |
15782
|
0 |
0 |
for (auto&& weight : w.hidden_layer) |
15786
|
0 |
0 |
for (auto&& weight : w.hidden_layer) |
15790
|
0 |
0 |
for (auto&& weight : w.hidden_layer) |
15791
|
0 |
0 |
if (weight < 0) weight = 0; |
15794
|
0 |
0 |
if (dropout_hidden) { // Dropout normalization |
15796
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15800
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15801
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
15803
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) // Bias |
15808
|
0 |
0 |
for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i]; |
|
0 |
0 |
for (unsigned i = 1; i < outcomes_size; i++) if (w.outcomes[i] > max) max = w.outcomes[i]; |
15811
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) sum += (w.outcomes[i] = exp(w.outcomes[i] - max)); |
15814
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) w.outcomes[i] *= sum; |
15861
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
|
0 |
0 |
if (network.weights[0].size() > w.weights_batch[0].size()) w.weights_batch[0].resize(network.weights[0].size()); |
15862
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
|
0 |
0 |
if (network.weights[1].size() > w.weights_batch[1].size()) w.weights_batch[1].resize(network.weights[1].size()); |
15863
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding.size()) w.error_embedding.resize(embeddings.size()); |
15864
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
|
0 |
0 |
if (embeddings.size() > w.error_embedding_nonempty.size()) w.error_embedding_nonempty.resize(embeddings.size()); |
15868
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
15869
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
|
0 |
0 |
while (network.weights[0].size() > w.weights_trainer[0].size()) w.weights_trainer[0].emplace_back(network.weights[0].front().size()); |
15870
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
|
0 |
0 |
while (network.weights[1].size() > w.weights_trainer[1].size()) w.weights_trainer[1].emplace_back(outcomes_size); |
15875
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
15876
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
|
0 |
0 |
w.error_outcomes[i] = (i == required_outcome) - w.outcomes[i]; |
15880
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15881
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
15884
|
0 |
0 |
if (dropout_hidden) { |
|
0 |
0 |
if (dropout_hidden) { |
|
0 |
0 |
if (dropout_hidden) { |
|
0 |
0 |
if (dropout_hidden) { |
|
0 |
0 |
if (dropout_hidden) { |
15886
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15893
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15897
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
15903
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15904
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
|
0 |
0 |
if (w.hidden_layer[i] <= 0) |
15910
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
|
0 |
0 |
for (auto&& i : w.hidden_kept) { |
15911
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][i].empty()) w.weights_batch[1][i].resize(outcomes_size); |
15912
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
|
0 |
0 |
for (unsigned j = 0; j < outcomes_size; j++) |
15916
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
|
0 |
0 |
if (w.weights_batch[1][hidden_layer_size].empty()) w.weights_batch[1][hidden_layer_size].resize(outcomes_size); |
15917
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
|
0 |
0 |
for (unsigned i = 0; i < outcomes_size; i++) |
15921
|
0 |
0 |
if (dropout_input) { |
|
0 |
0 |
if (dropout_input) { |
|
0 |
0 |
if (dropout_input) { |
|
0 |
0 |
if (dropout_input) { |
|
0 |
0 |
if (dropout_input) { |
15923
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15928
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
|
0 |
0 |
for (auto&& embedding_ids : embedding_ids_sequences) |
15931
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
15932
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
|
0 |
0 |
if (embedding_ids && (*embedding_ids)[i] >= 0) { |
15936
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
|
0 |
0 |
if (embeddings[i].can_update_weights(embedding_id)) { |
15937
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
|
0 |
0 |
if (w.error_embedding[i].size() <= unsigned(embedding_id)) w.error_embedding[i].resize(embedding_id + 1); |
15938
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
|
0 |
0 |
if (w.error_embedding[i][embedding_id].empty()) { |
15940
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
|
0 |
0 |
w.error_embedding_nonempty[i].emplace_back(embedding_id); |
15946
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
|
0 |
0 |
for (unsigned dimension = embeddings[i].dimension; dimension; dimension--, index++, embedding++, error_embedding += !!error_embedding) |
15947
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
|
0 |
0 |
if (w.input_dropout.empty() || !w.input_dropout[index]) { |
15948
|
0 |
0 |
if (error_embedding) |
|
0 |
0 |
if (error_embedding) |
|
0 |
0 |
if (error_embedding) |
|
0 |
0 |
if (error_embedding) |
|
0 |
0 |
if (error_embedding) |
15949
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
15951
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
15952
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
|
0 |
0 |
for (auto&& j : w.hidden_kept) |
15962
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
|
0 |
0 |
if (w.weights_batch[0][index].empty()) w.weights_batch[0][index].resize(hidden_layer_size); |
15963
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
|
0 |
0 |
for (auto&& i : w.hidden_kept) |
15968
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
0 |
0 |
if (++w.batch < batch_size) return; |
|
0 |
0 |
if (++w.batch < batch_size) return; |
15972
|
0 |
0 |
if (!network.weights[0].empty()) |
|
0 |
0 |
if (!network.weights[0].empty()) |
|
0 |
0 |
if (!network.weights[0].empty()) |
|
0 |
0 |
if (!network.weights[0].empty()) |
|
0 |
0 |
if (!network.weights[0].empty()) |
15973
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
15974
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
|
0 |
0 |
for (unsigned j = 0; j < w.weights_batch[i].size(); j++) |
15975
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
|
0 |
0 |
if (!w.weights_batch[i][j].empty()) { |
15976
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
|
0 |
0 |
for (unsigned k = 0; k < w.weights_batch[i][j].size(); k++) |
15977
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
|
0 |
0 |
network.weights[i][j][k] += TRAINER::delta(w.weights_batch[i][j][k], trainer, TRAINER::need_trainer_data ? w.weights_trainer[i][j][k] : none_trainer_data) - (j+1 == w.weights_batch[i].size() ? /*bias*/ 0. : l2_regularization) * network.weights[i][j][k]; |
15983
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
|
0 |
0 |
for (unsigned i = 0; i < embeddings.size(); i++) { |
15984
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
|
0 |
0 |
for (auto&& id : w.error_embedding_nonempty[i]) { |
15985
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
|
0 |
0 |
if (TRAINER::need_trainer_data) { |
15986
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
|
0 |
0 |
if (w.embedding_trainer.size() <= i) w.embedding_trainer.resize(i + 1); |
15987
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
|
0 |
0 |
if (w.embedding_trainer[i].size() <= id) w.embedding_trainer[i].resize(id + 1); |
15988
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
|
0 |
0 |
if (w.embedding_trainer[i][id].size() < embeddings[i].dimension) w.embedding_trainer[i][id].resize(embeddings[i].dimension); |
15991
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
|
0 |
0 |
for (unsigned j = 0; j < embeddings[i].dimension; j++) |
15992
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
|
0 |
0 |
embedding[j] += TRAINER::delta(w.error_embedding[i][id][j], trainer, TRAINER::need_trainer_data ? w.embedding_trainer[i][id][j] : none_trainer_data) - l2_regularization * embedding[j]; |
15999
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
|
0 |
0 |
if (maxnorm_regularization) maxnorm_regularize(); |
16026
|
0 |
0 |
training_failure("Internal error, unsupported trainer!"); |
|
0 |
0 |
training_failure("Internal error, unsupported trainer!"); |
16030
|
0 |
0 |
if (!l1_regularization) return; |
16032
|
0 |
0 |
for (auto&& weights : network.weights) |
16033
|
0 |
0 |
for (unsigned i = 0; i + 1 /*ignore biases*/ < weights.size(); i++) { |
16035
|
0 |
0 |
for (auto&& weight : row) |
16036
|
0 |
0 |
if (weight < l1_regularization) weight += l1_regularization; |
16037
|
0 |
0 |
else if (weight > l1_regularization) weight -= l1_regularization; |
16043
|
0 |
0 |
if (!maxnorm_regularization) return; |
16045
|
0 |
0 |
for (unsigned i = 0; i < 2; i++) |
16046
|
0 |
0 |
for (unsigned j = 0; j < network.weights[i].front().size(); j++) { |
16048
|
0 |
0 |
for (auto&& row : network.weights[i]) |
16051
|
0 |
0 |
if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) { |
|
0 |
0 |
if (length > 0 && length > maxnorm_regularization * maxnorm_regularization) { |
16053
|
0 |
0 |
for (auto&& row : network.weights[i]) |
16060
|
0 |
0 |
if (l1_regularization) l1_regularize(); |
|
0 |
0 |
if (l1_regularization) l1_regularize(); |
|
0 |
0 |
if (l1_regularization) l1_regularize(); |
16065
|
0 |
0 |
enc.add_4B(m.empty() ? 0 : m.front().size()); |
16067
|
0 |
0 |
for (auto&& row : m) { |
16068
|
0 |
0 |
assert(row.size() == m.front().size()); |
16283
|
1 |
0 |
struct workspace { |
|
2 |
1 |
struct workspace { |
16284
|
2 |
1 |
workspace(bool single_root) : conf(single_root) {} |
16342
|
0 |
0 |
ifstream in(path_from_utf8(file).c_str(), ifstream::in | ifstream::binary); |
16343
|
0 |
0 |
if (!in.is_open()) return nullptr; |
16344
|
0 |
0 |
return load(in, cache); |
16351
|
1 |
0 |
if (!compressor::load(in, data)) return nullptr; |
|
1 |
0 |
if (!compressor::load(in, data)) return nullptr; |
16355
|
1 |
0 |
data.next_str(name); |
16357
|
1 |
0 |
result.reset(create(name)); |
16358
|
0 |
1 |
if (!result) return nullptr; |
16360
|
1 |
0 |
result->load(data, cache); |
|
0 |
0 |
result->load(data, cache); |
16365
|
1 |
0 |
return result && data.is_end() ? result.release() : nullptr; |
|
1 |
0 |
return result && data.is_end() ? result.release() : nullptr; |
16369
|
1 |
0 |
if (name == "nn") return new parser_nn(false); |
16370
|
0 |
0 |
if (name == "nn_versioned") return new parser_nn(true); |
16398
|
1 |
0 |
if (beam_size > 1) |
16401
|
0 |
0 |
parse_greedy(t, cost); |
16405
|
0 |
0 |
assert(system); |
16406
|
0 |
0 |
if (cost) *cost = 0.; |
16410
|
0 |
0 |
if (!w) w = new workspace(single_root); |
16416
|
0 |
0 |
if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size()); |
16417
|
0 |
0 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
16418
|
0 |
0 |
if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size()); |
16419
|
0 |
0 |
for (size_t j = 0; j < embeddings.size(); j++) { |
16427
|
0 |
0 |
for (; !w->conf.final(); transitions++) { |
16431
|
0 |
0 |
for (size_t i = 0; i < w->extracted_nodes.size(); i++) |
16432
|
0 |
0 |
w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr; |
16439
|
0 |
0 |
for (unsigned i = 0; i < w->outcomes.size(); i++) |
16440
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
|
0 |
0 |
if (system->applicable(w->conf, i) && (best < 0 || w->outcomes[i] > w->outcomes[best])) |
16445
|
0 |
0 |
if (cost) *cost += log(w->outcomes[best]); |
16448
|
0 |
0 |
if (child >= 0) |
16449
|
0 |
0 |
for (size_t i = 0; i < embeddings.size(); i++) { |
16455
|
0 |
0 |
if (cost && transitions) |
16463
|
0 |
1 |
assert(system); |
16467
|
1 |
0 |
if (!w) w = new workspace(single_root); |
16470
|
2 |
1 |
for (int i = 0; i < 2; i++) { |
16471
|
2 |
10 |
while (w->bs_confs[i].size() < beam_size) w->bs_confs[i].emplace_back(single_root); |
16472
|
0 |
2 |
while (w->bs_confs[i].size() > beam_size) w->bs_confs[i].pop_back(); |
16481
|
1 |
0 |
if (w->embeddings.size() < t.nodes.size()) w->embeddings.resize(t.nodes.size()); |
16482
|
1 |
0 |
if (w->embeddings_values.size() < t.nodes.size()) w->embeddings_values.resize(t.nodes.size()); |
16483
|
8 |
1 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
16484
|
8 |
0 |
if (w->embeddings[i].size() < embeddings.size()) w->embeddings[i].resize(embeddings.size()); |
16485
|
8 |
0 |
if (w->embeddings_values[i].size() < embeddings.size()) w->embeddings_values[i].resize(embeddings.size()); |
16486
|
32 |
8 |
for (size_t j = 0; j < embeddings.size(); j++) { |
16494
|
1 |
15 |
for (bool all_final = false; !all_final; iteration++) { |
16498
|
67 |
15 |
for (size_t c = 0; c < w->bs_confs_size[iteration & 1]; c++) { |
16501
|
5 |
62 |
if (bs_conf.conf.final()) { |
16502
|
0 |
5 |
if (w->bs_alternatives.size() == beam_size) { |
16503
|
0 |
0 |
if (bs_conf.cost <= w->bs_alternatives[0].cost) continue; |
16515
|
496 |
62 |
for (size_t i = 0; i < t.nodes.size(); i++) |
16516
|
1984 |
496 |
for (size_t j = 0; j < embeddings.size(); j++) { |
16518
|
96 |
1888 |
if (w->word != w->embeddings_values[i][j]) { |
16527
|
1116 |
62 |
for (size_t i = 0; i < w->extracted_nodes.size(); i++) |
16528
|
410 |
706 |
w->extracted_embeddings[i] = w->extracted_nodes[i] >= 0 ? &w->embeddings[w->extracted_nodes[i]] : nullptr; |
16534
|
806 |
62 |
for (unsigned i = 0; i < w->outcomes.size(); i++) |
16535
|
633 |
173 |
if (system->applicable(bs_conf.conf, i)) { |
16537
|
567 |
66 |
if (w->bs_alternatives.size() == beam_size) { |
16538
|
170 |
397 |
if (cost <= w->bs_alternatives[0].cost) continue; |
16548
|
15 |
71 |
for (auto&& alternative : w->bs_alternatives) { |
16552
|
66 |
5 |
if (alternative.transition >= 0) { |
16562
|
4 |
1 |
for (size_t i = 1; i < w->bs_confs_size[iteration & 1]; i++) |
16563
|
2 |
2 |
if (w->bs_confs[iteration & 1][i].cost > w->bs_confs[iteration & 1][best].cost) |
16567
|
0 |
1 |
if (cost) *cost = w->bs_confs[iteration & 1][best].cost * (t.nodes.size() - 1); |
16574
|
129 |
1032 |
for (auto&& node : conf.t->nodes) node.children.clear(); |
16575
|
1032 |
129 |
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
16578
|
302 |
730 |
if (heads[i] >= 0) conf.t->nodes[heads[i]].children.push_back(i); |
16583
|
1 |
66 |
if (conf.t->nodes.size() > heads.size()) heads.resize(conf.t->nodes.size()); |
16584
|
1 |
66 |
if (conf.t->nodes.size() > deprels.size()) deprels.resize(conf.t->nodes.size()); |
16585
|
536 |
67 |
for (size_t i = 0; i < conf.t->nodes.size(); i++) { |
16594
|
0 |
1 |
version = versioned ? data.next_1B() : 1; |
|
0 |
0 |
version = versioned ? data.next_1B() : 1; |
16595
|
0 |
1 |
if (!(version >= 1 && version <= VERSION_LATEST)) |
16598
|
0 |
1 |
single_root = version >= 2 ? data.next_1B() : false; |
|
0 |
0 |
single_root = version >= 2 ? data.next_1B() : false; |
|
0 |
0 |
single_root = version >= 2 ? data.next_1B() : false; |
16601
|
1 |
0 |
labels.resize(data.next_2B()); |
|
1 |
0 |
labels.resize(data.next_2B()); |
16602
|
6 |
1 |
for (auto&& label : labels) |
16603
|
6 |
0 |
data.next_str(label); |
16607
|
1 |
0 |
data.next_str(system_name); |
16608
|
1 |
0 |
system.reset(transition_system::create(system_name, labels)); |
16609
|
0 |
1 |
if (!system) throw binary_decoder_error("Cannot load transition system"); |
16612
|
1 |
0 |
data.next_str(description); |
16613
|
1 |
0 |
if (!nodes.create(description, error)) |
|
0 |
1 |
if (!nodes.create(description, error)) |
16617
|
1 |
0 |
values.resize(data.next_2B()); |
|
1 |
0 |
values.resize(data.next_2B()); |
16618
|
4 |
1 |
for (auto&& value : values) { |
16619
|
4 |
0 |
data.next_str(description); |
16620
|
4 |
0 |
if (!value.create(description, error)) |
|
0 |
4 |
if (!value.create(description, error)) |
16624
|
1 |
0 |
embeddings.resize(values.size()); |
16625
|
4 |
1 |
for (auto&& embedding : embeddings) |
16626
|
4 |
0 |
embedding.load(data); |
16629
|
1 |
0 |
network.load(data); |
16630
|
1 |
0 |
network.generate_tanh_cache(); |
16631
|
1 |
0 |
network.generate_embeddings_cache(embeddings, embeddings_cache, cache); |
16678
|
0 |
0 |
if (train.empty()) training_failure("No training data was given!"); |
|
0 |
0 |
if (train.empty()) training_failure("No training data was given!"); |
|
0 |
0 |
if (train.empty()) training_failure("No training data was given!"); |
16684
|
0 |
0 |
for (auto&& tree : train) |
16685
|
0 |
0 |
for (auto&& node : tree.nodes) |
16686
|
0 |
0 |
if (node.id) { |
16687
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
|
0 |
0 |
if (node.head < 0) training_failure("The node '" << node.form << "' with id " << node.id << " has no head set!"); |
16688
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
|
0 |
0 |
if (node.deprel.empty()) training_failure("The node '" << node.form << "' with id " << node.id << " has no deprel set!"); |
16696
|
0 |
0 |
for (auto&& tree : train) |
16697
|
0 |
0 |
for (auto&& node : tree.nodes) |
16698
|
0 |
0 |
if (node.id && !labels_set.count(node.deprel)) { |
16700
|
0 |
0 |
parser.labels.push_back(node.deprel); |
16704
|
0 |
0 |
if (single_root) { |
16705
|
0 |
0 |
for (auto&& tree : train) { |
16707
|
0 |
0 |
for (auto&& node : tree.nodes) |
16708
|
0 |
0 |
if (node.id) { |
16709
|
0 |
0 |
if (node.head == 0 && node.deprel != "root") |
|
0 |
0 |
if (node.head == 0 && node.deprel != "root") |
|
0 |
0 |
if (node.head == 0 && node.deprel != "root") |
16710
|
0 |
0 |
training_failure("When single root is required, every root node must have 'root' deprel!"); |
|
0 |
0 |
training_failure("When single root is required, every root node must have 'root' deprel!"); |
16711
|
0 |
0 |
if (node.head != 0 && node.deprel == "root") |
|
0 |
0 |
if (node.head != 0 && node.deprel == "root") |
|
0 |
0 |
if (node.head != 0 && node.deprel == "root") |
16712
|
0 |
0 |
training_failure("When single root is required, any non-root cannot have 'root' deprel!"); |
|
0 |
0 |
training_failure("When single root is required, any non-root cannot have 'root' deprel!"); |
16715
|
0 |
0 |
if (roots != 1) |
16716
|
0 |
0 |
training_failure("When single root is required, every training tree must have single root!"); |
|
0 |
0 |
training_failure("When single root is required, every training tree must have single root!"); |
16720
|
0 |
0 |
if (!labels_set.count("root")) |
|
0 |
0 |
if (!labels_set.count("root")) |
16721
|
0 |
0 |
training_failure("When single root is required, the deprel 'root' must be present!"); |
|
0 |
0 |
training_failure("When single root is required, the deprel 'root' must be present!"); |
16722
|
0 |
0 |
if (labels_set.size() <= 1) |
16723
|
0 |
0 |
training_failure("When single root is required, deprel different from 'root' must exist!"); |
|
0 |
0 |
training_failure("When single root is required, deprel different from 'root' must exist!"); |
16727
|
0 |
0 |
parser.system.reset(transition_system::create(transition_system_name, parser.labels)); |
16728
|
0 |
0 |
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
|
0 |
0 |
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
|
0 |
0 |
if (!parser.system) training_failure("Cannot create transition system '" << transition_system_name << "'!"); |
16730
|
0 |
0 |
unique_ptr oracle(parser.system->oracle(transition_oracle_name)); |
16731
|
0 |
0 |
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
|
0 |
0 |
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
|
0 |
0 |
if (!oracle) training_failure("Cannot create transition oracle '" << transition_oracle_name << "' for transition system '" << transition_system_name << "'!"); |
16735
|
0 |
0 |
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
|
0 |
0 |
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
|
0 |
0 |
if (!parser.nodes.create(nodes_description, error)) training_failure(error); |
16740
|
0 |
0 |
split(embeddings_description, '\n', lines); |
16741
|
0 |
0 |
for (auto&& line : lines) { |
16743
|
0 |
0 |
if (!line.len || line.str[0] == '#') continue; |
|
0 |
0 |
if (!line.len || line.str[0] == '#') continue; |
16745
|
0 |
0 |
split(line, ' ', tokens); |
16746
|
0 |
0 |
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
|
0 |
0 |
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
|
0 |
0 |
if (!(tokens.size() >= 3 && tokens.size() <= 6)) |
16747
|
0 |
0 |
training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!"); |
|
0 |
0 |
training_failure("Expected 3 to 6 columns on embedding description line '" << line << "'!"); |
16749
|
0 |
0 |
value_names.emplace_back(string(tokens[0].str, tokens[0].len)); |
16750
|
0 |
0 |
parser.values.emplace_back(); |
16751
|
0 |
0 |
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
|
0 |
0 |
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
|
0 |
0 |
if (!parser.values.back().create(tokens[0], error)) training_failure(error); |
16753
|
0 |
0 |
int dimension = parse_int(tokens[1], "embedding dimension"); |
16754
|
0 |
0 |
int min_count = parse_int(tokens[2], "minimum frequency count"); |
16764
|
0 |
0 |
for (auto&& tree : train) |
16765
|
0 |
0 |
for (auto&& node : tree.nodes) |
16766
|
0 |
0 |
if (node.id) { |
16767
|
0 |
0 |
parser.values.back().extract(node, word); |
16772
|
0 |
0 |
if (tokens.size() >= 4) { |
16773
|
0 |
0 |
int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1; |
|
0 |
0 |
int update_weights = tokens.size() >= 5 ? parse_int(tokens[4], "update weights") : 1; |
16774
|
0 |
0 |
int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max(); |
|
0 |
0 |
int max_embeddings = tokens.size() >= 6 ? parse_int(tokens[5], "maximum embeddings count") : numeric_limits::max(); |
16775
|
0 |
0 |
ifstream in(path_from_utf8(string(tokens[3].str, tokens[3].len)).c_str()); |
16776
|
0 |
0 |
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
|
0 |
0 |
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
|
0 |
0 |
if (!in.is_open()) training_failure("Cannot load '" << tokens[0] << "' embedding from file '" << tokens[3] << "'!"); |
16781
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
|
0 |
0 |
if (!getline(in, line)) training_failure("Cannot read first line from embedding file '" << tokens[3] << "'!"); |
16782
|
0 |
0 |
split(line, ' ', parts); |
16783
|
0 |
0 |
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
|
0 |
0 |
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
|
0 |
0 |
if (parts.size() != 2) training_failure("Expected two numbers on the first line of embedding file '" << tokens[3] << "'!"); |
16784
|
0 |
0 |
int file_dimension = parse_int(parts[1], "embedding file dimension"); |
16786
|
0 |
0 |
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
|
0 |
0 |
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
|
0 |
0 |
if (file_dimension < dimension) training_failure("The embedding file '" << tokens[3] << "' has lower dimension than required!"); |
16790
|
0 |
0 |
if (file_dimension > dimension) { |
16791
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
|
0 |
0 |
embeddings_from_file_comment = "[dim" + to_string(file_dimension) + "->" + to_string(dimension) + "]"; |
16794
|
0 |
0 |
projection.resize(dimension); |
16795
|
0 |
0 |
for (auto&& row : projection) { |
16796
|
0 |
0 |
row.resize(file_dimension); |
16797
|
0 |
0 |
for (auto&& weight : row) weight = uniform(generator); |
16800
|
0 |
0 |
for (auto&& weight : row) sum += weight; |
16801
|
0 |
0 |
for (auto&& weight : row) weight /= sum; |
16806
|
0 |
0 |
vector input_weights(file_dimension); |
16807
|
0 |
0 |
vector projected_weights(dimension); |
16808
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
|
0 |
0 |
while (getline(in, line) && int(weights.size()) < max_embeddings) { |
16809
|
0 |
0 |
split(line, ' ', parts); |
16810
|
0 |
0 |
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
|
0 |
0 |
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
|
0 |
0 |
if (!parts.empty() && !parts.back().len) parts.pop_back(); // Ignore space at the end of line |
16811
|
0 |
0 |
if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]); |
|
0 |
0 |
if (int(parts.size()) != file_dimension + 1) training_failure("Wrong number of values on line '" << line << "' of embedding file '" << tokens[3]); |
16812
|
0 |
0 |
for (int i = 0; i < file_dimension; i++) |
16813
|
0 |
0 |
input_weights[i] = parse_double(parts[1 + i], "embedding weight"); |
16818
|
0 |
0 |
if (update_weights == 2 && !word_counts.count(word)) |
16821
|
0 |
0 |
for (int i = 0; i < dimension; i++) |
16822
|
0 |
0 |
if (file_dimension == dimension) { |
16826
|
0 |
0 |
for (int j = 0; j < file_dimension; j++) |
16830
|
0 |
0 |
if (!weights_set.count(word)) { |
16831
|
0 |
0 |
weights.emplace_back(word, projected_weights); |
16836
|
0 |
0 |
updatable_index = update_weights ? 0 : embeddings_from_file; |
16842
|
0 |
0 |
for (auto&& word_count : word_counts) |
16843
|
0 |
0 |
if (word_count.second >= min_count && !weights_set.count(word_count.first)) |
16844
|
0 |
0 |
count_words.emplace_back(word_count.second, word_count.first); |
16848
|
0 |
0 |
vector word_weights(dimension); |
16850
|
0 |
0 |
for (auto&& count_word : count_words) { |
16851
|
0 |
0 |
for (auto&& word_weight : word_weights) |
16854
|
0 |
0 |
weights.emplace_back(count_word.second, word_weights); |
16859
|
0 |
0 |
vector unknown_weights(dimension); |
16860
|
0 |
0 |
if (min_count > 1) { |
16863
|
0 |
0 |
for (auto&& weight : unknown_weights) |
16868
|
0 |
0 |
parser.embeddings.emplace_back(); |
16869
|
0 |
0 |
parser.embeddings.back().create(dimension, updatable_index, weights, unknown_weights); |
16874
|
0 |
0 |
for (auto&& tree : train) |
16875
|
0 |
0 |
for (auto&& node : tree.nodes) |
16876
|
0 |
0 |
if (node.id) { |
16877
|
0 |
0 |
parser.values.back().extract(node, word); |
16879
|
0 |
0 |
int word_id = parser.embeddings.back().lookup_word(word, buffer); |
16881
|
0 |
0 |
words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file; |
|
0 |
0 |
words_covered_from_file += word_id != parser.embeddings.back().unknown_word() && unsigned(word_id) < embeddings_from_file; |
16891
|
0 |
0 |
for (auto&& embedding : parser.embeddings) total_dimension += embedding.dimension; |
16892
|
0 |
0 |
for (auto&& tree : train) total_nodes += tree.nodes.size() - 1; |
16896
|
0 |
0 |
neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator); |
|
0 |
0 |
neural_network_trainer network_trainer(parser.network, total_dimension * parser.nodes.node_count(), parser.system->transition_count(), scaled_parameters, generator); |
16902
|
0 |
0 |
for (size_t i = 0; i < train.size(); i++) |
16905
|
0 |
0 |
for (int iteration = 1; network_trainer.next_iteration(); iteration++) { |
16922
|
0 |
0 |
tree t_eval; |
16930
|
0 |
0 |
for (unsigned current_index; (current_index = atomic_index++) < permutation.size();) { |
16934
|
0 |
0 |
conf.init(&t); |
16937
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
16938
|
0 |
0 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
16939
|
0 |
0 |
nodes_embeddings[i].resize(parser.embeddings.size()); |
16940
|
0 |
0 |
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
16941
|
0 |
0 |
parser.values[j].extract(t.nodes[i], word); |
16942
|
0 |
0 |
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
16947
|
0 |
0 |
auto tree_oracle = oracle->create_tree_oracle(gold); |
16950
|
0 |
0 |
while (!conf.final()) { |
16952
|
0 |
0 |
parser.nodes.extract(conf, extracted_nodes); |
16953
|
0 |
0 |
extracted_embeddings.resize(extracted_nodes.size()); |
16954
|
0 |
0 |
for (size_t i = 0; i < extracted_nodes.size(); i++) |
16955
|
0 |
0 |
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
16958
|
0 |
0 |
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
16962
|
0 |
0 |
for (unsigned i = 0; i < workspace.outcomes.size(); i++) |
16963
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf, i) && (network_best < 0 || workspace.outcomes[i] > workspace.outcomes[network_best])) |
16967
|
0 |
0 |
auto prediction = tree_oracle->predict(conf, network_best, iteration); |
16970
|
0 |
0 |
if (parser.system->applicable(conf, prediction.best)) { |
|
0 |
0 |
if (parser.system->applicable(conf, prediction.best)) { |
16972
|
0 |
0 |
if (workspace.outcomes[prediction.best]) |
16976
|
0 |
0 |
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, prediction.best, workspace); |
16980
|
0 |
0 |
if (!parser.system->applicable(conf, prediction.to_follow)) |
|
0 |
0 |
if (!parser.system->applicable(conf, prediction.to_follow)) |
16984
|
0 |
0 |
int child = parser.system->perform(conf, prediction.to_follow); |
16987
|
0 |
0 |
if (child >= 0) |
16988
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
16989
|
0 |
0 |
parser.values[i].extract(t.nodes[child], word); |
16990
|
0 |
0 |
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
16996
|
0 |
0 |
if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) { |
|
0 |
0 |
if (parameters.structured_interval && (current_index % parameters.structured_interval) == 0) { |
17001
|
0 |
0 |
conf.init(&t); |
17004
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
|
0 |
0 |
if (t.nodes.size() > nodes_embeddings.size()) nodes_embeddings.resize(t.nodes.size()); |
17005
|
0 |
0 |
for (size_t i = 0; i < t.nodes.size(); i++) { |
17006
|
0 |
0 |
nodes_embeddings[i].resize(parser.embeddings.size()); |
17007
|
0 |
0 |
for (size_t j = 0; j < parser.embeddings.size(); j++) { |
17008
|
0 |
0 |
parser.values[j].extract(t.nodes[i], word); |
17009
|
0 |
0 |
nodes_embeddings[i][j] = parser.embeddings[j].lookup_word(word, word_buffer); |
17014
|
0 |
0 |
auto tree_oracle = oracle->create_tree_oracle(gold); |
17017
|
0 |
0 |
while (!conf.final()) { |
17019
|
0 |
0 |
parser.nodes.extract(conf, extracted_nodes); |
17020
|
0 |
0 |
extracted_embeddings.resize(extracted_nodes.size()); |
17021
|
0 |
0 |
for (size_t i = 0; i < extracted_nodes.size(); i++) |
17022
|
0 |
0 |
extracted_embeddings[i] = extracted_nodes[i] >= 0 ? &nodes_embeddings[extracted_nodes[i]] : nullptr; |
17027
|
0 |
0 |
tree_oracle->interesting_transitions(conf, transitions_eval); |
17028
|
0 |
0 |
for (auto&& transition : transitions_eval) { |
17030
|
0 |
0 |
conf_eval = conf; |
17032
|
0 |
0 |
nodes_embeddings_eval = nodes_embeddings; |
17035
|
0 |
0 |
int child = parser.system->perform(conf_eval, transition); |
17036
|
0 |
0 |
if (child >= 0) |
17037
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
17038
|
0 |
0 |
parser.values[i].extract(t_eval.nodes[child], word); |
17039
|
0 |
0 |
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
17043
|
0 |
0 |
while (!conf_eval.final()) { |
17045
|
0 |
0 |
parser.nodes.extract(conf_eval, extracted_nodes_eval); |
17046
|
0 |
0 |
extracted_embeddings_eval.resize(extracted_nodes_eval.size()); |
17047
|
0 |
0 |
for (size_t i = 0; i < extracted_nodes_eval.size(); i++) |
17048
|
0 |
0 |
extracted_embeddings_eval[i] = extracted_nodes_eval[i] >= 0 ? &nodes_embeddings_eval[extracted_nodes_eval[i]] : nullptr; |
17051
|
0 |
0 |
parser.network.propagate(parser.embeddings, extracted_embeddings_eval, hidden_layer_eval, outcomes_eval, nullptr, false); |
17055
|
0 |
0 |
for (unsigned i = 0; i < outcomes_eval.size(); i++) |
17056
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
|
0 |
0 |
if (parser.system->applicable(conf_eval, i) && (network_best < 0 || outcomes_eval[i] > outcomes_eval[network_best])) |
17060
|
0 |
0 |
int child = parser.system->perform(conf_eval, network_best); |
17063
|
0 |
0 |
if (child >= 0) |
17064
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
17065
|
0 |
0 |
parser.values[i].extract(t_eval.nodes[child], word); |
17066
|
0 |
0 |
nodes_embeddings_eval[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
17071
|
0 |
0 |
for (unsigned i = 1; i < gold.nodes.size(); i++) |
17074
|
0 |
0 |
if (uas > best_uas) best = transition, best_uas = uas; |
17078
|
0 |
0 |
network_trainer.propagate(parser.embeddings, extracted_embeddings, workspace); |
17081
|
0 |
0 |
if (workspace.outcomes[best]) |
17083
|
0 |
0 |
network_trainer.backpropagate(parser.embeddings, extracted_embeddings, best, workspace); |
17092
|
0 |
0 |
int child = parser.system->perform(conf, /*network_*/best); |
17095
|
0 |
0 |
if (child >= 0) |
17096
|
0 |
0 |
for (size_t i = 0; i < parser.embeddings.size(); i++) { |
17097
|
0 |
0 |
parser.values[i].extract(t.nodes[child], word); |
17098
|
0 |
0 |
nodes_embeddings[child][i] = parser.embeddings[i].lookup_word(word, word_buffer); |
17104
|
0 |
0 |
for (double old_atomic_logprob = atomic_logprob; atomic_logprob.compare_exchange_weak(old_atomic_logprob, old_atomic_logprob + logprob); ) {} |
17107
|
0 |
0 |
cerr << "Iteration " << iteration << ": "; |
|
0 |
0 |
cerr << "Iteration " << iteration << ": "; |
17108
|
0 |
0 |
training(); |
17112
|
0 |
0 |
if (!heldout.empty()) { |
17113
|
0 |
0 |
tree t; |
17115
|
0 |
0 |
for (auto&& gold : heldout) { |
17119
|
0 |
0 |
for (size_t i = 1; i < t.nodes.size(); i++) { |
17122
|
0 |
0 |
correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel; |
|
0 |
0 |
correct_labelled += t.nodes[i].head == gold.nodes[i].head && t.nodes[i].deprel == gold.nodes[i].deprel; |
17126
|
0 |
0 |
cerr << ", heldout UAS " << fixed << setprecision(2) << (100. * correct_unlabelled / total) << "%, LAS " << (100. * correct_labelled / total) << "%"; |
17128
|
0 |
0 |
if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) { |
|
0 |
0 |
if (parameters.early_stopping && correct_labelled > heldout_best_correct_labelled) { |
17129
|
0 |
0 |
heldout_best_network = parser.network; |
17138
|
0 |
0 |
if (parameters.early_stopping && heldout_best_iteration > 0) { |
|
0 |
0 |
if (parameters.early_stopping && heldout_best_iteration > 0) { |
17140
|
0 |
0 |
parser.network = heldout_best_network; |
17144
|
0 |
0 |
enc.add_1B(parser.version); |
17147
|
0 |
0 |
enc.add_1B(single_root); |
17150
|
0 |
0 |
enc.add_2B(parser.labels.size()); |
17151
|
0 |
0 |
for (auto&& label : parser.labels) |
17152
|
0 |
0 |
enc.add_str(label); |
17153
|
0 |
0 |
enc.add_str(transition_system_name); |
17156
|
0 |
0 |
enc.add_str(nodes_description); |
17159
|
0 |
0 |
enc.add_2B(value_names.size()); |
17160
|
0 |
0 |
for (auto&& value_name : value_names) |
17161
|
0 |
0 |
enc.add_str(value_name); |
17162
|
0 |
0 |
for (auto&& embedding : parser.embeddings) |
17163
|
0 |
0 |
embedding.save(enc); |
17166
|
0 |
0 |
network_trainer.save_network(enc); |
17188
|
0 |
387 |
if (conf.single_root && label_is_root) |
|
0 |
0 |
if (conf.single_root && label_is_root) |
17191
|
351 |
36 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2]; |
|
90 |
261 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2]; |
17195
|
0 |
15 |
assert(applicable(conf)); |
17206
|
0 |
395 |
if (conf.single_root && label_is_root) |
|
0 |
0 |
if (conf.single_root && label_is_root) |
17207
|
0 |
0 |
return conf.stack.size() == 2 && conf.buffer.empty(); |
|
0 |
0 |
return conf.stack.size() == 2 && conf.buffer.empty(); |
17208
|
0 |
395 |
else if (conf.single_root) // && !label_is_root |
17215
|
0 |
23 |
assert(applicable(conf)); |
17229
|
0 |
28 |
assert(applicable(conf)); |
17238
|
0 |
0 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
|
0 |
0 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
|
0 |
0 |
return conf.stack.size() >= 2 && conf.stack[conf.stack.size() - 2] && conf.stack[conf.stack.size() - 2] < conf.stack[conf.stack.size() - 1]; |
17242
|
0 |
0 |
assert(applicable(conf)); |
17253
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
0 |
0 |
if (conf.single_root && label_is_root) |
17256
|
0 |
0 |
return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3]; |
|
0 |
0 |
return conf.stack.size() >= 3 && conf.stack[conf.stack.size() - 3]; |
17260
|
0 |
0 |
assert(applicable(conf)); |
17273
|
0 |
0 |
if (conf.single_root && label_is_root) |
|
0 |
0 |
if (conf.single_root && label_is_root) |
17275
|
0 |
0 |
else if (conf.single_root) // && !label_is_root |
17282
|
0 |
0 |
assert(applicable(conf)); |
17386
|
0 |
806 |
assert(transition < transitions.size()); |
17392
|
0 |
66 |
assert(transition < transitions.size()); |
17398
|
1 |
0 |
if (name == "projective") return new transition_system_projective(labels); |
|
1 |
0 |
if (name == "projective") return new transition_system_projective(labels); |
17399
|
0 |
0 |
if (name == "swap") return new transition_system_swap(labels); |
|
0 |
0 |
if (name == "swap") return new transition_system_swap(labels); |
17400
|
0 |
0 |
if (name == "link2") return new transition_system_link2(labels); |
|
0 |
0 |
if (name == "link2") return new transition_system_link2(labels); |
17422
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
17423
|
0 |
0 |
for (auto&& label : labels) { |
17424
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
17425
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
17426
|
0 |
0 |
transitions.emplace_back(new transition_left_arc_2(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_left_arc_2(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_left_arc_2(label)); |
17427
|
0 |
0 |
transitions.emplace_back(new transition_right_arc_2(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_right_arc_2(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_right_arc_2(label)); |
17435
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
17463
|
0 |
0 |
if (!conf.buffer.empty()) transitions.push_back(0); |
17468
|
0 |
0 |
for (int direction = 0; direction < 4; direction++) |
17469
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
17474
|
0 |
0 |
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
|
0 |
0 |
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
|
0 |
0 |
if (direction >= 2 && gold.nodes[child].head != parent) continue; |
17476
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17477
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17478
|
0 |
0 |
if (!conf.single_root || |
|
0 |
0 |
if (!conf.single_root || |
17479
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
17480
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
|
0 |
0 |
(i != root_label && conf.stack.size() > 2 && direction < 2) || |
17481
|
0 |
0 |
(i != root_label && conf.stack.size() > 3 && direction >= 2)) |
|
0 |
0 |
(i != root_label && conf.stack.size() > 3 && direction >= 2)) |
17490
|
0 |
0 |
for (int direction = 0; direction < 4; direction++) |
17491
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
|
0 |
0 |
if (conf.stack.size() >= parents[direction] && conf.stack.size() >= children[direction]) { |
17495
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
17496
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17497
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17510
|
0 |
0 |
if (name == "static") return new transition_system_link2_oracle_static(labels); |
|
0 |
0 |
if (name == "static") return new transition_system_link2_oracle_static(labels); |
17532
|
1 |
0 |
transitions.emplace_back(new transition_shift()); |
|
1 |
0 |
transitions.emplace_back(new transition_shift()); |
17533
|
6 |
1 |
for (auto&& label : labels) { |
17534
|
6 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
6 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
6 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
17535
|
6 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
6 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
6 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
17543
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
17569
|
0 |
0 |
if (!conf.buffer.empty()) transitions.push_back(0); |
17570
|
0 |
0 |
if (conf.stack.size() >= 2) |
17571
|
0 |
0 |
for (int direction = 0; direction < 2; direction++) { |
17573
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17574
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17575
|
0 |
0 |
if (!conf.single_root || |
|
0 |
0 |
if (!conf.single_root || |
17576
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
17577
|
0 |
0 |
(i != root_label && conf.stack.size() > 2)) |
17584
|
0 |
0 |
if (conf.stack.size() >= 2) { |
17587
|
0 |
0 |
if (gold.nodes[child].head == parent) { |
17588
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17589
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17597
|
0 |
0 |
if (conf.stack.size() >= 2) { |
17600
|
0 |
0 |
if (gold.nodes[child].head == parent && |
|
0 |
0 |
if (gold.nodes[child].head == parent && |
|
0 |
0 |
if (gold.nodes[child].head == parent && |
17601
|
0 |
0 |
(conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) { |
|
0 |
0 |
(conf.buffer.empty() || gold.nodes[child].children.empty() || gold.nodes[child].children.back() < conf.buffer.back())) { |
17602
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17603
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17618
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
17648
|
0 |
0 |
if (iteration <= 1) |
17657
|
0 |
0 |
if (!conf.buffer.empty()) { |
17659
|
0 |
0 |
for (size_t i = conf.buffer.size(); i--; ) { |
17662
|
0 |
0 |
for (auto&& child : gold.nodes[node].children) |
17664
|
0 |
0 |
if (to_right_stack) { |
17665
|
0 |
0 |
right_stack.push_back(node); |
17672
|
0 |
0 |
class t_representation { |
|
0 |
0 |
class t_representation { |
|
0 |
0 |
class t_representation { |
|
0 |
0 |
class t_representation { |
17675
|
0 |
0 |
: stack(stack), right_stack(right_stack), gold(gold), labels(labels) { |
|
0 |
0 |
: stack(stack), right_stack(right_stack), gold(gold), labels(labels) { |
17676
|
0 |
0 |
for (int i = 0; i < 2; i++) { |
17677
|
0 |
0 |
costs[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
17678
|
0 |
0 |
transitions[i].reserve((stack.size() + right_stack.size()) * (stack.size() + right_stack.size())); |
17680
|
0 |
0 |
} |
|
0 |
0 |
} |
17690
|
0 |
0 |
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
|
0 |
0 |
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
|
0 |
0 |
int node(unsigned i, unsigned /*j*/, unsigned h) const { return h <= i ? stack[stack.size() - 1 - i + h] : right_stack[h - i - 1]; } |
17693
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17694
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17706
|
0 |
0 |
} t(conf.stack, right_stack, gold, labels); |
17708
|
0 |
0 |
t.prepare(0); |
17710
|
0 |
0 |
for (unsigned diagonal = 0; diagonal < conf.stack.size() + right_stack.size(); diagonal++) { |
17711
|
0 |
0 |
t.prepare(diagonal + 1); |
17712
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
|
0 |
0 |
for (unsigned i = diagonal > right_stack.size() ? diagonal - right_stack.size() : 0; i <= diagonal && i < conf.stack.size(); i++) { |
17716
|
0 |
0 |
if (i+1 < conf.stack.size()) |
17717
|
0 |
0 |
for (unsigned h = 0; h <= diagonal; h++) { |
17719
|
0 |
0 |
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
|
0 |
0 |
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
|
0 |
0 |
if (new_node && t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i+1, j, h+1) + (t.transition(i, j, h) != 0)) { |
17721
|
0 |
0 |
t.transition(i+1, j, h+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(h_node, new_node); |
17723
|
0 |
0 |
if (t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i+1, j, 0) + (t.transition(i, j, h) != 0)) { |
17725
|
0 |
0 |
t.transition(i+1, j, 0) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : t.which_arc_transition(new_node, h_node); |
17730
|
0 |
0 |
if (j+1 < right_stack.size() + 1) |
17731
|
0 |
0 |
for (unsigned h = 0; h <= diagonal; h++) { |
17733
|
0 |
0 |
if (t.cost(i, j, h) + t.edge_cost(h_node, new_node) < t.cost(i, j+1, h) + (t.transition(i, j, h) > 0)) { |
17735
|
0 |
0 |
t.transition(i, j+1, h) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
17737
|
0 |
0 |
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
|
0 |
0 |
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
|
0 |
0 |
if (h_node && t.cost(i, j, h) + t.edge_cost(new_node, h_node) < t.cost(i, j+1, diagonal+1) + (t.transition(i, j, h) > 0)) { |
17739
|
0 |
0 |
t.transition(i, j+1, diagonal+1) = t.transition(i, j, h) >= 0 ? t.transition(i, j, h) : 0; |
17750
|
0 |
0 |
if (name == "static") return new transition_system_projective_oracle_static(labels); |
|
0 |
0 |
if (name == "static") return new transition_system_projective_oracle_static(labels); |
17751
|
0 |
0 |
if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels); |
|
0 |
0 |
if (name == "dynamic") return new transition_system_projective_oracle_dynamic(labels); |
17773
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
|
0 |
0 |
transitions.emplace_back(new transition_shift()); |
17774
|
0 |
0 |
transitions.emplace_back(new transition_swap()); |
|
0 |
0 |
transitions.emplace_back(new transition_swap()); |
17775
|
0 |
0 |
for (auto&& label : labels) { |
17776
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_left_arc(label)); |
17777
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
|
0 |
0 |
transitions.emplace_back(new transition_right_arc(label)); |
17785
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
|
0 |
0 |
for (root_label = 0; root_label < labels.size(); root_label++) if (labels[root_label] == "root") break; |
17791
|
0 |
0 |
: labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {} |
|
0 |
0 |
: labels(labels), root_label(root_label), gold(gold), projective_order(projective_order), projective_components(projective_components) {} |
17818
|
0 |
0 |
if (lazy) { |
17819
|
0 |
0 |
tree_oracle_static projective_oracle(labels, root_label, gold, vector(), vector()); |
17822
|
0 |
0 |
transition_system_swap system(labels); |
17824
|
0 |
0 |
conf.init(&t); |
17825
|
0 |
0 |
while (!conf.final()) { |
17827
|
0 |
0 |
if (!system.applicable(conf, prediction.to_follow)) break; |
|
0 |
0 |
if (!system.applicable(conf, prediction.to_follow)) break; |
17828
|
0 |
0 |
system.perform(conf, prediction.to_follow); |
17832
|
0 |
0 |
for (auto&& node : conf.stack) |
17833
|
0 |
0 |
if (node) |
17837
|
0 |
0 |
return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components))); |
|
0 |
0 |
return unique_ptr(new tree_oracle_static(labels, root_label, gold, move(projective_order), move(projective_components))); |
17842
|
0 |
0 |
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
|
0 |
0 |
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
|
0 |
0 |
while (child_index < gold.nodes[node].children.size() && gold.nodes[node].children[child_index] < node) |
17845
|
0 |
0 |
while (child_index < gold.nodes[node].children.size()) |
17851
|
0 |
0 |
for (auto&& child : gold.nodes[node].children) |
17857
|
0 |
0 |
if (!conf.buffer.empty()) transitions.push_back(0); |
17858
|
0 |
0 |
if (conf.stack.size() >= 2) { |
17860
|
0 |
0 |
if (!projective_order.empty()) { |
17863
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
17864
|
0 |
0 |
(projective_components.empty() || |
17865
|
0 |
0 |
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
17870
|
0 |
0 |
for (int direction = 0; direction < 2; direction++) { |
17872
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17873
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17874
|
0 |
0 |
if (!conf.single_root || |
|
0 |
0 |
if (!conf.single_root || |
17875
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
|
0 |
0 |
(i == root_label && conf.stack.size() == 2 && conf.buffer.empty() && direction == 1) || |
17876
|
0 |
0 |
(i != root_label && conf.stack.size() > 2)) |
17884
|
0 |
0 |
if (conf.stack.size() >= 2) { |
17887
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
17888
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17889
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17897
|
0 |
0 |
if (conf.stack.size() >= 2) { |
17900
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
|
0 |
0 |
if (gold.nodes[child].head == parent && gold.nodes[child].children.size() == conf.t->nodes[child].children.size()) { |
17901
|
0 |
0 |
for (size_t i = 0; i < labels.size(); i++) |
17902
|
0 |
0 |
if (gold.nodes[child].deprel == labels[i]) |
17910
|
0 |
0 |
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
|
0 |
0 |
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
|
0 |
0 |
if (conf.stack.size() >= 2 && !projective_order.empty()) { |
17913
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
|
0 |
0 |
if (projective_order[last] < projective_order[prev] && |
17914
|
0 |
0 |
(projective_components.empty() || |
17915
|
0 |
0 |
(conf.buffer.empty() || projective_components[last] != projective_components[conf.buffer.back()]))) |
17925
|
0 |
0 |
if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false); |
|
0 |
0 |
if (name == "static_eager") return new transition_system_swap_oracle_static(labels, false); |
17926
|
0 |
0 |
if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true); |
|
0 |
0 |
if (name == "static_lazy") return new transition_system_swap_oracle_static(labels, true); |
17950
|
1 |
0 |
clear(); |
17964
|
0 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
|
0 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
|
0 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
|
7 |
0 |
nodes.emplace_back((int)nodes.size(), form); |
17969
|
38 |
0 |
assert(id >= 0 && id < int(nodes.size())); |
|
0 |
38 |
assert(id >= 0 && id < int(nodes.size())); |
17970
|
0 |
38 |
assert(head < int(nodes.size())); |
17973
|
0 |
38 |
if (nodes[id].head >= 0) { |
17975
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
17976
|
0 |
0 |
if (children[i-1] == id) { |
17985
|
38 |
0 |
if (head >= 0) { |
17988
|
27 |
29 |
while (i && children[i-1] > id) i--; |
|
9 |
18 |
while (i && children[i-1] > id) i--; |
|
18 |
38 |
while (i && children[i-1] > id) i--; |
17989
|
9 |
29 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
9 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
38 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
17994
|
0 |
0 |
for (auto&& node : nodes) { |
|
0 |
0 |
for (auto&& node : nodes) { |
|
0 |
0 |
for (auto&& node : nodes) { |
|
0 |
0 |
for (auto&& node : nodes) { |
|
8 |
1 |
for (auto&& node : nodes) { |
18088
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
18118
|
0 |
0 |
if (name == "conllu") return new_conllu_input_format(); |
18128
|
0 |
0 |
if (name == "conllu") return new_conllu_output_format(); |
18156
|
0 |
0 |
if (make_copy) { |
18171
|
0 |
0 |
while (text.len) { |
18174
|
0 |
0 |
while (line.len < text.len && line.str[line.len] != '\n') line.len++; |
|
0 |
0 |
while (line.len < text.len && line.str[line.len] != '\n') line.len++; |
18179
|
0 |
0 |
if (!line.len) { |
18180
|
0 |
0 |
if (t.empty()) continue; |
18184
|
0 |
0 |
if (*line.str == '#') { |
18186
|
0 |
0 |
if (t.empty()) comments.push_back(line); |
|
0 |
0 |
if (t.empty()) comments.push_back(line); |
18191
|
0 |
0 |
split(line, '\t', tokens); |
18192
|
0 |
0 |
if (tokens.size() != 10) |
18193
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
18196
|
0 |
0 |
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
18197
|
0 |
0 |
split(tokens[0], '-', parts); |
18198
|
0 |
0 |
if (parts.size() != 2) |
18199
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
18201
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
18203
|
0 |
0 |
if (from != int(t.nodes.size())) |
18204
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
18205
|
0 |
0 |
if (to < from) |
18206
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
18207
|
0 |
0 |
if (from <= last_multiword_token) |
18208
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
18210
|
0 |
0 |
multiword_tokens.emplace_back(from, line); |
18216
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
18218
|
0 |
0 |
if (id != int(t.nodes.size())) |
18219
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
18222
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
18225
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
18227
|
0 |
0 |
if (head < 0) |
18228
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
18233
|
0 |
0 |
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
|
0 |
0 |
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
|
0 |
0 |
if (!(tokens[2].len == 1 && tokens[2].str[0] == '_')) node.lemma.assign(tokens[2].str, tokens[2].len); |
18234
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) node.upostag.assign(tokens[3].str, tokens[3].len); |
18235
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) node.xpostag.assign(tokens[4].str, tokens[4].len); |
18236
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) node.feats.assign(tokens[5].str, tokens[5].len); |
18238
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) node.deprel.assign(tokens[7].str, tokens[7].len); |
18239
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) node.deps.assign(tokens[8].str, tokens[8].len); |
18240
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) node.misc.assign(tokens[9].str, tokens[9].len); |
18244
|
0 |
0 |
if (last_multiword_token >= int(t.nodes.size())) |
18245
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(multiword_tokens.back().second.str, multiword_tokens.back().second.len).append("'!"), false; |
18248
|
0 |
0 |
for (auto&& node : t.nodes) |
18249
|
0 |
0 |
if (node.id && node.head >= 0) { |
|
0 |
0 |
if (node.id && node.head >= 0) { |
18250
|
0 |
0 |
if (node.head >= int(t.nodes.size())) |
18251
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
|
0 |
0 |
return error.assign("Node ID '").append(to_string(node.id)).append("' form '").append(node.form).append("' has too large head: '").append(to_string(node.head)).append("'!"), false; |
18252
|
0 |
0 |
t.set_head(node.id, node.head, node.deprel); |
18266
|
0 |
0 |
auto input_conllu = dynamic_cast(additional_info); |
18270
|
0 |
0 |
if (input_conllu) |
18271
|
0 |
0 |
for (auto&& comment : input_conllu->comments) |
18275
|
0 |
0 |
for (int i = 1 /*skip the root node*/; i < int(t.nodes.size()); i++) { |
18277
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
|
0 |
0 |
if (input_conllu && input_conllu_multiword_tokens < input_conllu->multiword_tokens.size() && |
18285
|
0 |
0 |
output.append(to_string(i)).push_back('\t'); |
18291
|
0 |
0 |
output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t'); |
|
0 |
0 |
output.append(t.nodes[i].head < 0 ? "_" : to_string(t.nodes[i].head)).push_back('\t'); |
18348
|
0 |
0 |
return {1, 1, 1, "devel"}; |
|
0 |
0 |
return {1, 1, 1, "devel"}; |
18359
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
18361
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
18363
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
18407
|
20 |
2 |
const string input_format_conllu::columns[10] = {"ID", "FORM", "LEMMA", |
18408
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
2 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
|
0 |
0 |
"UPOS", "XPOS", "FEATS", "HEAD", "DEPREL", "DEPS", "MISC"}; |
18419
|
0 |
0 |
if (make_copy) { |
18432
|
0 |
0 |
while (text.len) { |
18435
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++; |
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\r' && line.str[line.len] != '\n')) line.len++; |
18438
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
18440
|
0 |
0 |
else if (text.len && *text.str == '\n') |
|
0 |
0 |
else if (text.len && *text.str == '\n') |
18444
|
0 |
0 |
if (!line.len) { |
18445
|
0 |
0 |
if (s.empty()) continue; |
18449
|
0 |
0 |
if (*line.str == '#') { |
18451
|
0 |
0 |
if (s.empty()) s.comments.emplace_back(line.str, line.len); |
|
0 |
0 |
if (s.empty()) s.comments.emplace_back(line.str, line.len); |
18456
|
0 |
0 |
split(line, '\t', tokens); |
18457
|
0 |
0 |
if (tokens.size() != 10) |
18458
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' does not contain 10 columns!") , false; |
18461
|
0 |
0 |
for (int i = 0; i < 10; i++) { |
18462
|
0 |
0 |
if (!tokens[i].len) |
18463
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains empty column ").append(columns[i]).append("!"), false; |
18464
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
|
0 |
0 |
if ((version < 2 || (i != 1 && i != 2 && i != 9)) && memchr(tokens[i].str, ' ', tokens[i].len) != NULL) |
18465
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
|
0 |
0 |
return error.assign("The CoNLL-U line '").append(line.str, line.len).append("' contains spaces in column ").append(columns[i]).append("!"), false; |
18469
|
0 |
0 |
if (memchr(tokens[0].str, '-', tokens[0].len)) { |
18470
|
0 |
0 |
split(tokens[0], '-', parts); |
18471
|
0 |
0 |
if (parts.size() != 2) |
18472
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
|
0 |
0 |
return error.assign("Cannot parse ID of multiword token '").append(line.str, line.len).append("'!") , false; |
18474
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U id", from, error) || !parse_int(parts[1], "CoNLL-U id", to, error)) |
18476
|
0 |
0 |
if (from != int(s.words.size())) |
18477
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
18478
|
0 |
0 |
if (to < from) |
18479
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect range '").append(tokens[0].str, tokens[0].len).append("' of multiword token '").append(line.str, line.len).append("'!"), false; |
18480
|
0 |
0 |
if (from <= last_multiword_token) |
18481
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
|
0 |
0 |
return error.assign("Multiword token '").append(line.str, line.len).append("' overlaps with the previous one!"), false; |
18483
|
0 |
0 |
for (int i = 2; i < 9; i++) |
18484
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
18485
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an multi-word token '").append(line.str, line.len).append("' is not an empty!"), false; |
18486
|
0 |
0 |
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
|
0 |
0 |
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
|
0 |
0 |
s.multiword_tokens.emplace_back(from, to, tokens[1], tokens[9].len == 1 && tokens[9].str[0] == '_' ? string_piece() : tokens[9]); |
18491
|
0 |
0 |
if (version >= 2) |
18492
|
0 |
0 |
if (memchr(tokens[0].str, '.', tokens[0].len)) { |
18493
|
0 |
0 |
split(tokens[0], '.', parts); |
18494
|
0 |
0 |
if (parts.size() != 2) |
18495
|
0 |
0 |
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
|
0 |
0 |
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
|
0 |
0 |
return error.assign("Cannot parse ID of empty node '").append(line.str, line.len).append("'!") , false; |
18497
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
|
0 |
0 |
if (!parse_int(parts[0], "CoNLL-U empty node id", id, error) || !parse_int(parts[1], "CoNLL-U empty node index", index, error)) |
18499
|
0 |
0 |
if (id != int(s.words.size()) - 1) |
18500
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(parts[0].str, parts[0].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
18501
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
|
0 |
0 |
if (!((s.empty_nodes.empty() && index == 1) || (!s.empty_nodes.empty() && s.empty_nodes.back().id < id && index == 1) || |
18502
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
|
0 |
0 |
(!s.empty_nodes.empty() && s.empty_nodes.back().id == id && index == s.empty_nodes.back().index + 1))) |
18503
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID index '").append(parts[1].str, parts[1].len).append("' of empty node token '").append(line.str, line.len).append("'!"), false; |
18504
|
0 |
0 |
for (int i = 6; i < 8; i++) |
18505
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
|
0 |
0 |
if (tokens[i].len != 1 || tokens[i].str[0] != '_') |
18506
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
|
0 |
0 |
return error.assign("Column ").append(columns[i]).append(" of an empty node token '").append(line.str, line.len).append("' is not an empty!"), false; |
18508
|
0 |
0 |
s.empty_nodes.emplace_back(id, index); |
18511
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) s.empty_nodes.back().upostag.assign(tokens[3].str, tokens[3].len); |
18512
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) s.empty_nodes.back().xpostag.assign(tokens[4].str, tokens[4].len); |
18513
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) s.empty_nodes.back().feats.assign(tokens[5].str, tokens[5].len); |
18514
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) s.empty_nodes.back().deps.assign(tokens[8].str, tokens[8].len); |
18515
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) s.empty_nodes.back().misc.assign(tokens[9].str, tokens[9].len); |
18521
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
|
0 |
0 |
if (!parse_int(tokens[0], "CoNLL-U id", id, error)) |
18523
|
0 |
0 |
if (id != int(s.words.size())) |
18524
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
|
0 |
0 |
return error.assign("Incorrect ID '").append(tokens[0].str, tokens[0].len).append("' of CoNLL-U line '").append(line.str, line.len).append("'!"), false; |
18527
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
|
0 |
0 |
if (tokens[6].len == 1 && tokens[6].str[0] == '_') { |
18530
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
|
0 |
0 |
if (!parse_int(tokens[6], "CoNLL-U head", head, error)) |
18532
|
0 |
0 |
if (head < 0) |
18533
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
|
0 |
0 |
return error.assign("Numeric head value '").append(tokens[0].str, tokens[0].len).append("' cannot be negative!"), false; |
18539
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
|
0 |
0 |
if (!(tokens[3].len == 1 && tokens[3].str[0] == '_')) word.upostag.assign(tokens[3].str, tokens[3].len); |
18540
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
|
0 |
0 |
if (!(tokens[4].len == 1 && tokens[4].str[0] == '_')) word.xpostag.assign(tokens[4].str, tokens[4].len); |
18541
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
|
0 |
0 |
if (!(tokens[5].len == 1 && tokens[5].str[0] == '_')) word.feats.assign(tokens[5].str, tokens[5].len); |
18543
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
|
0 |
0 |
if (!(tokens[7].len == 1 && tokens[7].str[0] == '_')) word.deprel.assign(tokens[7].str, tokens[7].len); |
18544
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
|
0 |
0 |
if (!(tokens[8].len == 1 && tokens[8].str[0] == '_')) word.deps.assign(tokens[8].str, tokens[8].len); |
18545
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
|
0 |
0 |
if (!(tokens[9].len == 1 && tokens[9].str[0] == '_')) word.misc.assign(tokens[9].str, tokens[9].len); |
18549
|
0 |
0 |
if (last_multiword_token >= int(s.words.size())) |
18550
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false; |
|
0 |
0 |
return error.assign("There are words missing for multiword token '").append(s.multiword_tokens.back().form).append("'!"), false; |
18553
|
0 |
0 |
for (auto&& word : s.words) |
18554
|
0 |
0 |
if (word.id && word.head >= 0) { |
|
0 |
0 |
if (word.id && word.head >= 0) { |
18555
|
0 |
0 |
if (word.head >= int(s.words.size())) |
18556
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
|
0 |
0 |
return error.assign("Node ID '").append(to_string(word.id)).append("' form '").append(word.form).append("' has too large head: '").append(to_string(word.head)).append("'!"), false; |
18557
|
0 |
0 |
s.set_head(word.id, word.head, word.deprel); |
18581
|
0 |
0 |
if (getline(is, block)) |
18595
|
0 |
0 |
if (make_copy) { |
18607
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
18613
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
18617
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != ' ' && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
18623
|
0 |
0 |
if (s.words.back().form.find("\302\240") != string::npos) { |
18626
|
0 |
0 |
for (size_t i = 0; i < form.size(); i++) { |
18627
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
|
0 |
0 |
if (form_len && form[form_len-1] == '\302' && form[i] == '\240') |
18636
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t')) |
|
0 |
0 |
while (text.len && (*text.str == ' ' || *text.str == '\t')) |
18640
|
0 |
0 |
if (!s.empty()) { |
18642
|
0 |
0 |
if (new_document) |
18647
|
0 |
0 |
if (preceeding_newlines >= 2) |
18652
|
0 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
18688
|
0 |
0 |
if (make_copy) { |
18700
|
0 |
0 |
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
0 |
0 |
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
|
0 |
0 |
while (text.len && (*text.str == '\t' || *text.str == '\r' || *text.str == '\n')) { |
18706
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') { |
18710
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != '\t' && *text.str != '\r' && *text.str != '\n') |
18716
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') |
|
0 |
0 |
while (text.len && *text.str != '\r' && *text.str != '\n') |
18720
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
|
0 |
0 |
if (text.len >= 2 && text.str[0] == '\r' && text.str[1] == '\n') |
18722
|
0 |
0 |
else if (text.len && *text.str == '\n') |
|
0 |
0 |
else if (text.len && *text.str == '\n') |
18726
|
0 |
0 |
while (text.len && *text.str == '\t') |
|
0 |
0 |
while (text.len && *text.str == '\t') |
18730
|
0 |
0 |
if (!s.empty()) { |
18732
|
0 |
0 |
if (new_document) |
18737
|
0 |
0 |
if (preceeding_newlines >= 2) |
18742
|
0 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
18769
|
0 |
0 |
if (getline(is, block)) |
18784
|
0 |
0 |
if (make_copy) { |
18797
|
0 |
0 |
while (text.len && s.empty()) { |
|
0 |
0 |
while (text.len && s.empty()) { |
|
0 |
0 |
while (text.len && s.empty()) { |
18800
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
18804
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r')) |
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] != '\n' && line.str[line.len] != '\r')) |
18806
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
|
0 |
0 |
while (line.len < text.len && (line.str[line.len] == '\n' || line.str[line.len] == '\r')) { |
18813
|
0 |
0 |
tokenizer->set_text(line, false); |
18814
|
0 |
0 |
while (tokenizer->next_sentence(partial, error)) { |
|
0 |
0 |
while (tokenizer->next_sentence(partial, error)) { |
18817
|
0 |
0 |
for (size_t i = 1; i < partial.words.size(); i++) { |
18820
|
0 |
0 |
if (s.words.back().head > 0) s.words.back().head += words; |
18824
|
0 |
0 |
for (auto&& multiword_token : partial.multiword_tokens) { |
18831
|
0 |
0 |
for (auto&& empty_node : partial.empty_nodes) { |
18836
|
0 |
0 |
if (!error.empty()) return false; |
18838
|
0 |
0 |
if (s.empty()) { |
18844
|
0 |
0 |
if (!s.empty()) { |
18846
|
0 |
0 |
if (new_document) |
18847
|
0 |
0 |
s.set_new_doc(true, document_id); |
18851
|
0 |
0 |
if (preceeding_newlines >= 2) |
18852
|
0 |
0 |
s.set_new_par(true); |
18856
|
0 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
18859
|
0 |
0 |
s.comments.emplace_back("# text = "); |
18860
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
18861
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
18862
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
18866
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
18877
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
18881
|
0 |
0 |
if (parsed_options.count(CONLLU_V1)) |
18883
|
0 |
0 |
if (parsed_options.count(CONLLU_V2)) |
18886
|
0 |
0 |
return new input_format_conllu(version); |
18892
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
18898
|
0 |
0 |
input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges); |
|
0 |
0 |
input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges); |
|
0 |
0 |
input_format* result = new morphodita_tokenizer_wrapper(morphodita::tokenizer::new_generic_tokenizer(), nullptr, normalized_spaces, token_ranges); |
18899
|
0 |
0 |
return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result; |
|
0 |
0 |
return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result; |
|
0 |
0 |
return (parsed_options.count(GENERIC_TOKENIZER_PRESEGMENTED) && result) ? input_format::new_presegmented_tokenizer(result) : result; |
18912
|
0 |
0 |
size_t name_len = equal != string::npos ? equal : name.size(); |
18913
|
0 |
0 |
size_t option_offset = equal != string::npos ? equal + 1 : name.size(); |
18915
|
0 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_input_format(name.substr(option_offset)); |
18916
|
0 |
0 |
if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "generic_tokenizer") == 0) return new_generic_tokenizer_input_format(name.substr(option_offset)); |
18917
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_input_format(name.substr(option_offset)); |
18918
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_input_format(name.substr(option_offset)); |
18966
|
0 |
0 |
while (str.len) { |
18967
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
|
0 |
0 |
while (str.len && *str.str != '<' && *str.str != '>' && *str.str != '&' && (!data.encode_quot || *str.str != '"')) |
18970
|
0 |
0 |
if (str.len) { |
18971
|
0 |
0 |
if (to_print < str.str) os.write(to_print, str.str - to_print); |
18972
|
0 |
0 |
os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """); |
|
0 |
0 |
os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """); |
|
0 |
0 |
os << (*str.str == '<' ? "<" : *str.str == '>' ? ">" : *str.str == '&' ? "&" : """); |
18978
|
0 |
0 |
if (to_print < str.str) os.write(to_print, str.str - to_print); |
19014
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
4 |
3 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
2 |
5 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
7 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
7 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
|
0 |
0 |
const string& underscore_on_empty(const string& str) const { return str.empty() ? underscore : str; } |
19022
|
1 |
4 |
for (auto&& comment : s.comments) |
19027
|
8 |
1 |
for (int i = 0; i < int(s.words.size()); i++) { |
19029
|
7 |
1 |
if (i > 0) { |
19031
|
0 |
7 |
if (multiword_token < s.multiword_tokens.size() && |
|
0 |
0 |
if (multiword_token < s.multiword_tokens.size() && |
|
0 |
7 |
if (multiword_token < s.multiword_tokens.size() && |
19047
|
0 |
7 |
if (s.words[i].head < 0) os << '_'; else os << s.words[i].head; os << '\t' |
19054
|
8 |
0 |
if (version >= 2) |
19055
|
0 |
8 |
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
|
0 |
0 |
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
|
0 |
8 |
for (; empty_node < s.empty_nodes.size() && i == s.empty_nodes[empty_node].id; empty_node++) { |
19072
|
0 |
14 |
if (version >= 2 || str.find(' ') == string::npos) |
|
0 |
0 |
if (version >= 2 || str.find(' ') == string::npos) |
|
14 |
0 |
if (version >= 2 || str.find(' ') == string::npos) |
19075
|
0 |
0 |
for (auto&& chr : str) |
19076
|
0 |
0 |
os << (chr == ' ' ? '_' : chr); |
19092
|
0 |
0 |
json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; } |
|
0 |
0 |
json_builder& close() { if (!stack.empty()) { json.push_back(stack.back()); stack.pop_back(); } comma_needed = true; return *this; } |
19094
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(string_piece value) { comma(); string(value); comma_needed=true; return *this; } |
19095
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
|
0 |
0 |
json_builder& value(size_t value) { comma(); number(value); comma_needed=true; return *this; } |
19103
|
0 |
0 |
if (comma_needed) { |
19111
|
0 |
0 |
for (; str.len; str.str++, str.len--) |
19121
|
0 |
0 |
if (((unsigned char)*str.str) < 32) { |
19131
|
0 |
0 |
for (; value || start_size == json.size(); value /= 10) |
|
0 |
0 |
for (; value || start_size == json.size(); value /= 10) |
|
0 |
0 |
for (; value || start_size == json.size(); value /= 10) |
19146
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
|
0 |
0 |
json.object().key("id").value(++sentences).key("nodes").array(); |
19148
|
0 |
0 |
for (size_t i = 1; i < s.words.size(); i++) { |
19149
|
0 |
0 |
json.object().key("id").value(i).key("form").value(s.words[i].form); |
|
0 |
0 |
json.object().key("id").value(i).key("form").value(s.words[i].form); |
|
0 |
0 |
json.object().key("id").value(i).key("form").value(s.words[i].form); |
19152
|
0 |
0 |
if (s.words[i].get_token_range(start, end)) |
19153
|
0 |
0 |
json.key("start").value(start).key("end").value(end); |
|
0 |
0 |
json.key("start").value(start).key("end").value(end); |
19154
|
0 |
0 |
if (s.words[i].head == 0) |
19157
|
0 |
0 |
json.key("properties").object() |
|
0 |
0 |
json.key("properties").object() |
19158
|
0 |
0 |
.key("lemma").value(s.words[i].lemma) |
19159
|
0 |
0 |
.key("upos").value(s.words[i].upostag) |
19160
|
0 |
0 |
.key("xpos").value(s.words[i].xpostag); |
19162
|
0 |
0 |
for (auto&& feat : feats) { |
19164
|
0 |
0 |
while (key.len < feat.len && key.str[key.len] != '=') |
|
0 |
0 |
while (key.len < feat.len && key.str[key.len] != '=') |
19166
|
0 |
0 |
if (key.len + 1 < feat.len) |
19167
|
0 |
0 |
json.key(key).value(string_piece(key.str + key.len + 1, feat.len - key.len - 1)); |
19171
|
0 |
0 |
if (!s.words[i].children.empty()) { |
19173
|
0 |
0 |
for (auto&& child : s.words[i].children) |
19174
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
|
0 |
0 |
json.object().key("label").value(s.words[child].deprel).key("target").value(child).close(); |
19204
|
0 |
0 |
if (!sentences) { |
19210
|
0 |
0 |
for (auto&& node : s.words[0].children) |
19211
|
0 |
0 |
write_node(s, node, pad, os); |
19226
|
0 |
0 |
os << pad << "
|
|
0 |
0 |
os << pad << "
|
19227
|
0 |
0 |
<< "\" form=\"" << xml_encoded(s.words[node].form, true) |
19228
|
0 |
0 |
<< "\" lem=\"" << xml_encoded(s.words[node].lemma, true) |
19229
|
0 |
0 |
<< "\" mi=\"" << xml_encoded(s.words[node].feats, true) |
19230
|
0 |
0 |
<< "\" si=\"" << xml_encoded(s.words[node].deprel, true) << '"'; |
19232
|
0 |
0 |
if (s.words[node].children.empty()) { |
19236
|
0 |
0 |
for (auto&& child : s.words[node].children) |
19258
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
19263
|
0 |
0 |
for (size_t i = 1; i < s.words.size(); i++) { |
19265
|
0 |
0 |
for (auto&& chr : s.words[i].form) |
19266
|
0 |
0 |
if (chr == ' ') |
19267
|
0 |
0 |
line.append("\302\240"); |
19269
|
0 |
0 |
line.push_back(chr); |
19271
|
0 |
0 |
if (i+1 < s.words.size()) |
19272
|
0 |
0 |
line.push_back(' '); |
19290
|
0 |
0 |
if (normalized) { |
19291
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (!empty && (s.get_new_doc() || s.get_new_par())) |
19293
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
19294
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
19296
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) |
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) |
|
0 |
0 |
if (i+1 < s.words.size() && tok.get_space_after()) |
19298
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
19304
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
19305
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
19306
|
0 |
0 |
tok.get_spaces_before(spaces); os << spaces; |
19307
|
0 |
0 |
tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form); |
|
0 |
0 |
tok.get_spaces_in_token(spaces); os << (!spaces.empty() ? spaces : tok.form); |
19308
|
0 |
0 |
tok.get_spaces_after(spaces); os << spaces; |
19309
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
19331
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
|
0 |
0 |
if (paragraphs && !empty && (s.get_new_doc() || s.get_new_par())) |
19335
|
0 |
0 |
for (size_t i = 1; i < s.words.size(); i++) |
19344
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
1 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
19348
|
0 |
1 |
if (parsed_options.count(CONLLU_V1)) |
19350
|
0 |
1 |
if (parsed_options.count(CONLLU_V2)) |
19353
|
1 |
0 |
return new output_format_conllu(version); |
19361
|
0 |
0 |
return new output_format_matxin(); |
19367
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
19370
|
0 |
0 |
return new output_format_horizontal(parsed_options.count(HORIZONTAL_PARAGRAPHS)); |
19376
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
19379
|
0 |
0 |
return new output_format_plaintext(parsed_options.count(PLAINTEXT_NORMALIZED_SPACES)); |
19385
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
|
0 |
0 |
if (!named_values::parse(options, parsed_options, parse_error)) |
19388
|
0 |
0 |
return new output_format_vertical(parsed_options.count(VERTICAL_PARAGRAPHS)); |
19393
|
1 |
0 |
size_t name_len = equal != string::npos ? equal : name.size(); |
19394
|
0 |
1 |
size_t option_offset = equal != string::npos ? equal + 1 : name.size(); |
19396
|
1 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset)); |
|
1 |
0 |
if (name.compare(0, name_len, "conllu") == 0) return new_conllu_output_format(name.substr(option_offset)); |
19397
|
0 |
0 |
if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "epe") == 0) return new_epe_output_format(name.substr(option_offset)); |
19398
|
0 |
0 |
if (name.compare(0, name_len, "matxin") == 0) return new_matxin_output_format(name.substr(option_offset)); |
19399
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "horizontal") == 0) return new_horizontal_output_format(name.substr(option_offset)); |
19400
|
0 |
0 |
if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "plaintext") == 0) return new_plaintext_output_format(name.substr(option_offset)); |
19401
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset)); |
|
0 |
0 |
if (name.compare(0, name_len, "vertical") == 0) return new_vertical_output_format(name.substr(option_offset)); |
19421
|
1 |
0 |
clear(); |
19439
|
0 |
0 |
words.emplace_back((int)words.size(), form); |
|
0 |
0 |
words.emplace_back((int)words.size(), form); |
|
0 |
0 |
words.emplace_back((int)words.size(), form); |
19444
|
7 |
0 |
assert(id >= 0 && id < int(words.size())); |
|
0 |
7 |
assert(id >= 0 && id < int(words.size())); |
19445
|
0 |
7 |
assert(head < int(words.size())); |
19448
|
0 |
7 |
if (words[id].head >= 0) { |
19450
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
|
0 |
0 |
for (size_t i = children.size(); i && children[i-1] >= id; i--) |
19451
|
0 |
0 |
if (children[i-1] == id) { |
19460
|
7 |
0 |
if (head >= 0) { |
19463
|
4 |
3 |
while (i && children[i-1] > id) i--; |
|
4 |
0 |
while (i && children[i-1] > id) i--; |
|
0 |
7 |
while (i && children[i-1] > id) i--; |
19464
|
4 |
3 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
4 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
|
7 |
0 |
if (!i || children[i-1] < id) children.insert(children.begin() + i, id); |
19469
|
0 |
0 |
for (auto&& word : words) { |
19477
|
0 |
0 |
if (get_comment("newdoc id", id)) |
19486
|
1 |
0 |
if (new_doc && id.len) |
|
0 |
1 |
if (new_doc && id.len) |
19488
|
1 |
0 |
else if (new_doc) |
19493
|
0 |
0 |
if (get_comment("newpar id", id)) |
19502
|
1 |
0 |
if (new_par && id.len) |
|
0 |
1 |
if (new_par && id.len) |
19504
|
1 |
0 |
else if (new_par) |
19517
|
1 |
0 |
if (id.len) |
19530
|
0 |
0 |
if (text.len) |
19535
|
0 |
0 |
for (auto&& comment : comments) |
19536
|
0 |
0 |
if (comment[0] == '#') { |
19539
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
19542
|
0 |
0 |
if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) { |
|
0 |
0 |
if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) { |
|
0 |
0 |
if (j + name.len <= comment.size() && comment.compare(j, name.len, name.str, name.len) == 0) { |
19544
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
19545
|
0 |
0 |
if (j < comment.size() && comment[j] == '=') { |
|
0 |
0 |
if (j < comment.size() && comment[j] == '=') { |
|
0 |
0 |
if (j < comment.size() && comment[j] == '=') { |
19548
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
|
0 |
0 |
while (j < comment.size() && (comment[j] == ' ' || comment[j] == '\t')) j++; |
19549
|
0 |
0 |
if (value) value->assign(comment, j, comment.size() - j); |
19552
|
0 |
0 |
if (value) value->clear(); |
19563
|
7 |
8 |
for (unsigned i = comments.size(); i--; ) |
19564
|
0 |
7 |
if (comments[i][0] == '#') { |
19567
|
14 |
0 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
7 |
7 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
0 |
7 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
|
7 |
7 |
while (j < comments[i].size() && (comments[i][j] == ' ' || comments[i][j] == '\t')) j++; |
19570
|
2 |
5 |
if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0) |
|
2 |
0 |
if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0) |
|
7 |
0 |
if (j + name.len <= comments[i].size() && comments[i].compare(j, name.len, name.str, name.len) == 0) |
19579
|
3 |
0 |
comment.append("# ").append(name.str, name.len); |
|
3 |
0 |
comment.append("# ").append(name.str, name.len); |
19580
|
1 |
2 |
if (value.len) { |
19581
|
1 |
0 |
comment.append(" = "); |
19582
|
1 |
1 |
for (size_t i = 0; i < value.len; i++) |
19583
|
1 |
0 |
comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]); |
|
1 |
0 |
comment.push_back(value.str[i] == '\r' || value.str[i] == '\n' ? ' ' : value.str[i]); |
19602
|
3 |
8 |
if (form.len) this->form.assign(form.str, form.len); |
19603
|
0 |
11 |
if (misc.len) this->misc.assign(misc.str, misc.len); |
19610
|
2 |
4 |
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
|
2 |
0 |
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
|
2 |
0 |
return !(get_misc_field("SpaceAfter", value) && value.len == 2 && memcmp(value.str, "No", 2) == 0); |
19614
|
5 |
2 |
if (space_after) |
19624
|
0 |
0 |
if (get_misc_field("SpacesBefore", value)) |
19631
|
7 |
0 |
if (spaces_before.len == 0) |
19640
|
0 |
0 |
if (get_misc_field("SpacesAfter", value)) |
19643
|
0 |
0 |
spaces_after.assign(get_space_after() ? " " : ""); |
19647
|
2 |
5 |
if (spaces_after.len == 0) { |
19650
|
5 |
0 |
} else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') { |
|
5 |
0 |
} else if (spaces_after.len == 1 && spaces_after.str[0] == ' ') { |
19662
|
0 |
0 |
if (get_misc_field("SpacesInToken", value)) |
19669
|
7 |
0 |
if (spaces_in_token.len == 0) |
19679
|
0 |
0 |
if (!get_misc_field("TokenRange", value)) return false; |
19682
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
19683
|
0 |
0 |
if (start > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
19689
|
0 |
0 |
if (value.len == 0 || value.str[0] != ':') return false; |
|
0 |
0 |
if (value.len == 0 || value.str[0] != ':') return false; |
19693
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
|
0 |
0 |
while (value.len && value.str[0] >= '0' && value.str[0] <= '9') { |
19694
|
0 |
0 |
if (end > (numeric_limits::max() - (value.str[0] - '0')) / 10) |
19704
|
0 |
0 |
if (start == size_t(string::npos)) |
19707
|
0 |
0 |
start_misc_field("TokenRange").append(to_string(start)).append(1, ':').append(to_string(end)); |
19712
|
2 |
4 |
for (size_t index = 0; index < misc.size(); ) { |
19713
|
2 |
0 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
0 |
2 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
2 |
0 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
19717
|
2 |
0 |
value.len = (value.len == size_t(string::npos) ? misc.size() : value.len) - index; |
19721
|
0 |
0 |
if (index != size_t(string::npos)) index++; |
19727
|
8 |
28 |
for (size_t index = 0; index < misc.size(); ) |
19728
|
2 |
6 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
0 |
2 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
|
2 |
6 |
if (misc.compare(index, name.len, name.str, name.len) == 0 && misc[index + name.len] == '=') { |
19730
|
2 |
0 |
if (end_index == size_t(string::npos)) end_index = misc.size(); |
19733
|
0 |
2 |
if (index) |
19736
|
2 |
0 |
misc.erase(index, end_index + (end_index < misc.size() ? 1 : 0) - index); |
19739
|
6 |
0 |
if (index != size_t(string::npos)) index++; |
19745
|
0 |
2 |
if (!misc.empty()) misc.push_back('|'); |
19751
|
0 |
0 |
for (unsigned i = 0; i < spaces.len; i++) |
19773
|
0 |
0 |
for (unsigned i = 0; i < escaped_spaces.len; i++) |
19774
|
0 |
0 |
if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len) |
|
0 |
0 |
if (escaped_spaces.str[i] != '\\' || i+1 >= escaped_spaces.len) |
19876
|
0 |
0 |
sa_lowercased(data_lowercased), sa_categorized(data_categorized) {} |
|
0 |
0 |
sa_lowercased(data_lowercased), sa_categorized(data_categorized) {} |
|
0 |
0 |
sa_lowercased(data_lowercased), sa_categorized(data_categorized) {} |
19880
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
19881
|
0 |
0 |
token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i]; |
|
0 |
0 |
token* tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (token*)&s.multiword_tokens[j] : (token*)&s.words[i]; |
19883
|
0 |
0 |
if (previous_tok) { |
19886
|
0 |
0 |
if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0; |
|
0 |
0 |
if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0; |
|
0 |
0 |
if (!score) score = has_letters(previous_tok->form) && has_letters(tok->form) ? -1 : 0; |
19887
|
0 |
0 |
if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0; |
|
0 |
0 |
if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0; |
|
0 |
0 |
if (!score) score = only_digits(previous_tok->form) && only_digits(tok->form) ? -1 : 0; |
19888
|
0 |
0 |
if (!score) score = difference(previous_tok->form, tok->form, false, LOWERCASE); |
19889
|
0 |
0 |
if (!score) score = difference(previous_tok->form, tok->form, false, CATEGORIZE); |
19890
|
0 |
0 |
if (!score) score = difference(previous_tok->form, tok->form, true, CATEGORIZE); |
19892
|
0 |
0 |
if (score > 0) |
19900
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
19906
|
0 |
0 |
auto& func = mode == LOWERCASE ? perform_lowercase : perform_categorize; |
19907
|
0 |
0 |
auto& sa = mode == LOWERCASE ? sa_lowercased : sa_categorized; |
19910
|
0 |
0 |
string right_mapped = func(right); |
19913
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(right_mapped).append(separate?" ":""); |
19914
|
0 |
0 |
int together = sa.count(pattern); |
19916
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
|
0 |
0 |
pattern.assign(separate?" ":"").append(left_mapped).append(" ").append(right_mapped).append(separate?" ":""); |
19917
|
0 |
0 |
int apart = sa.count(pattern); |
19926
|
0 |
0 |
for (auto&& chr : utf8::decoder(input)) |
19927
|
0 |
0 |
utf8::append(output, unicode::lowercase(chr)); |
19935
|
0 |
0 |
for (auto&& chr : utf8::decoder(input)) { |
19937
|
0 |
0 |
if (category & unicode::C) output.push_back('C'); |
|
0 |
0 |
if (category & unicode::C) output.push_back('C'); |
19938
|
0 |
0 |
if (category & unicode::L) output.push_back('L'); |
|
0 |
0 |
if (category & unicode::L) output.push_back('L'); |
19939
|
0 |
0 |
if (category & unicode::M) output.push_back('M'); |
|
0 |
0 |
if (category & unicode::M) output.push_back('M'); |
19940
|
0 |
0 |
if (category & unicode::N) output.push_back('N'); |
|
0 |
0 |
if (category & unicode::N) output.push_back('N'); |
19941
|
0 |
0 |
if (category & unicode::Pc) output.push_back('c'); |
|
0 |
0 |
if (category & unicode::Pc) output.push_back('c'); |
19942
|
0 |
0 |
if (category & unicode::Pd) output.push_back('d'); |
|
0 |
0 |
if (category & unicode::Pd) output.push_back('d'); |
19943
|
0 |
0 |
if (category & unicode::Pe) output.push_back('e'); |
|
0 |
0 |
if (category & unicode::Pe) output.push_back('e'); |
19944
|
0 |
0 |
if (category & unicode::Pf) output.push_back('f'); |
|
0 |
0 |
if (category & unicode::Pf) output.push_back('f'); |
19945
|
0 |
0 |
if (category & unicode::Pi) output.push_back('i'); |
|
0 |
0 |
if (category & unicode::Pi) output.push_back('i'); |
19946
|
0 |
0 |
if (category & unicode::Po) output.push_back('o'); |
|
0 |
0 |
if (category & unicode::Po) output.push_back('o'); |
19947
|
0 |
0 |
if (category & unicode::Ps) output.push_back('s'); |
|
0 |
0 |
if (category & unicode::Ps) output.push_back('s'); |
19948
|
0 |
0 |
if (category & unicode::S) output.push_back('S'); |
|
0 |
0 |
if (category & unicode::S) output.push_back('S'); |
19949
|
0 |
0 |
if (category & unicode::Zl) output.push_back('Z'); |
|
0 |
0 |
if (category & unicode::Zl) output.push_back('Z'); |
19950
|
0 |
0 |
if (category & unicode::Zp) output.push_back('z'); |
|
0 |
0 |
if (category & unicode::Zp) output.push_back('z'); |
19951
|
0 |
0 |
if (category & unicode::Zs) output.push_back(' '); |
|
0 |
0 |
if (category & unicode::Zs) output.push_back(' '); |
19959
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) |
19960
|
0 |
0 |
if (unicode::category(chr) & unicode::L) |
19968
|
0 |
0 |
for (auto&& chr : utf8::decoder(word)) |
19969
|
0 |
0 |
if (unicode::category(chr) & ~unicode::N) |
19975
|
0 |
0 |
sa.reserve(str.size()); |
19976
|
0 |
0 |
for (unsigned i = 0; i < str.size(); i++) |
19977
|
0 |
0 |
sa.push_back(i); |
20003
|
1 |
0 |
: tokenizer(tokenizer), splitter(splitter), normalized_spaces(normalized_spaces), token_ranges(token_ranges) {} |
20023
|
0 |
1 |
for (char32_t chr; |
20024
|
1 |
0 |
text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
1 |
0 |
text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
20025
|
1 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); |
|
1 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); |
20032
|
34 |
1 |
for (following = text; following.len; unilib::utf8::decode(following.str, following.len)) |
20036
|
1 |
0 |
if (make_copy) { |
20053
|
0 |
2 |
if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) { |
|
1 |
1 |
if (tokenizer->next_sentence(&forms, token_ranges ? &tokens : nullptr)) { |
20056
|
7 |
1 |
for (size_t i = 0; i < forms.size(); i++) { |
20057
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
|
7 |
0 |
while (forms[i].len && (forms[i].str[0] == '\r' || forms[i].str[0] == '\n' || |
20058
|
0 |
7 |
forms[i].str[0] == '\t' || forms[i].str[0] == ' ')) |
20060
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
7 |
0 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
|
0 |
7 |
while (forms[i].len && (forms[i].str[forms[i].len-1] == '\r' || forms[i].str[forms[i].len-1] == '\n' || |
20061
|
0 |
7 |
forms[i].str[forms[i].len-1] == '\t' || forms[i].str[forms[i].len-1] == ' ')) |
20063
|
0 |
7 |
if (!forms[i].len) |
20066
|
1 |
0 |
if (!forms.size()) return next_sentence(s, error); |
20068
|
7 |
1 |
for (size_t i = 0; i < forms.size(); i++) { |
20072
|
34 |
7 |
for (size_t j = 0; j < forms[i].len; j++) { |
20074
|
34 |
0 |
if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' '; |
|
0 |
34 |
if (chr == '\r' || chr == '\n' || chr == '\t') chr = ' '; |
20075
|
0 |
34 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
0 |
0 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
0 |
0 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
|
34 |
0 |
if (chr != ' ' || tok.form.empty() || tok.form.back() != ' ') |
20080
|
1 |
6 |
if (i == 0) { |
20081
|
0 |
1 |
if (forms[0].str > text.str) |
20085
|
7 |
0 |
if (!normalized_spaces) { |
20086
|
1 |
6 |
tok.set_spaces_before(i == 0 ? saved_spaces : ""); |
|
7 |
0 |
tok.set_spaces_before(i == 0 ? saved_spaces : ""); |
20091
|
1 |
6 |
if (i+1 == forms.size()) { |
20096
|
1 |
1 |
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
0 |
1 |
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
|
1 |
1 |
for (char32_t chr; text.len && (following = text, chr = unilib::utf8::decode(following.str, following.len), |
20097
|
0 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following) |
|
0 |
0 |
(unilib::unicode::category(chr) & unilib::unicode::Zs) || chr == '\r' || chr == '\n' || chr == '\t'); text = following) |
20102
|
0 |
7 |
if (normalized_spaces) { |
20103
|
0 |
0 |
tok.set_space_after(i+1 == forms.size() ? !saved_spaces.empty() : forms[i+1].str > forms[i].str + forms[i].len); |
20105
|
0 |
7 |
tok.set_spaces_in_token(tok.form.size() != forms[i].len ? forms[i] : ""); |
20106
|
1 |
6 |
tok.set_spaces_after(i+1 == forms.size() ? saved_spaces : string_piece(forms[i].str + forms[i].len, forms[i+1].str - forms[i].str - forms[i].len)); |
20111
|
0 |
7 |
if (token_ranges) |
20114
|
7 |
0 |
if (splitter) |
20121
|
1 |
0 |
if (new_document) { |
20127
|
1 |
0 |
if (preceeding_newlines >= 2) |
20131
|
1 |
0 |
s.set_sent_id(to_string(sentence_id++)); |
20135
|
7 |
1 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
20136
|
0 |
7 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? (const token&)s.multiword_tokens[j].form : (const token&)s.words[i].form; |
20137
|
0 |
7 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
7 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
20141
|
6 |
1 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
2 |
4 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
|
3 |
4 |
if (i+1 < s.words.size() && tok.get_space_after()) s.comments.back().push_back(' '); |
20148
|
0 |
1 |
if (text.len) { |
20185
|
7 |
0 |
if (it == full_rules.end()) { |
20186
|
0 |
7 |
if (version >= 2) { |
20189
|
0 |
0 |
while (suffix.size() + 1 < buffer.size()) { |
20193
|
0 |
0 |
if (suffix_it == suffix_rules.end()) |
20196
|
0 |
0 |
if (!suffix_it->second.words.empty()) { |
20204
|
7 |
0 |
if (!prefix_len) { |
20207
|
2 |
5 |
if (misc.len) s.words.back().misc.assign(misc.str, misc.len); |
20215
|
0 |
0 |
if (unicode::category(utf8::first(token.str, token.len)) & unicode::Lut) { |
20217
|
0 |
0 |
for (auto&& chr : utf8::decoder(token.str, token.len)) |
20218
|
0 |
0 |
if (unicode::category(chr) & (unicode::L & ~unicode::Lut)) { casing = UC_FIRST; break; } |
20225
|
0 |
0 |
if (prefix_len) { |
20228
|
0 |
0 |
while (s.words.back().form.size() < prefix_len && suffix.len) |
|
0 |
0 |
while (s.words.back().form.size() < prefix_len && suffix.len) |
|
0 |
0 |
while (s.words.back().form.size() < prefix_len && suffix.len) |
20232
|
0 |
0 |
for (auto&& chr : utf8::decoder(it->second.words[0])) |
20233
|
0 |
0 |
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
|
0 |
0 |
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
|
0 |
0 |
utf8::append(s.words.back().form, casing == UC_ALL || (casing == UC_FIRST && s.words.back().form.empty()) ? unicode::uppercase(chr) : chr); |
20235
|
0 |
0 |
for (size_t i = 1; i < it->second.words.size(); i++) |
20236
|
0 |
0 |
if (casing != UC_ALL) { |
20246
|
1 |
0 |
if (!is.get(version)) return nullptr; |
20247
|
1 |
0 |
if (!(version >= 1 && version <= VERSION_LATEST)) return nullptr; |
20250
|
1 |
0 |
if (!compressor::load(is, data)) return nullptr; |
|
1 |
0 |
if (!compressor::load(is, data)) return nullptr; |
20252
|
1 |
0 |
unique_ptr splitter(new multiword_splitter(version)); |
20254
|
1 |
0 |
for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) { |
|
0 |
1 |
for (unsigned full_rules = data.next_4B(); full_rules; full_rules--) { |
20256
|
0 |
0 |
data.next_str(full_rule); |
20261
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
20262
|
0 |
0 |
info.words.emplace_back(); |
20263
|
0 |
0 |
data.next_str(info.words.back()); |
20265
|
0 |
0 |
if (info.words.empty()) return nullptr; |
20268
|
0 |
1 |
if (version >= 2) |
20269
|
0 |
0 |
for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) { |
|
0 |
0 |
for (unsigned suffix_rules = data.next_4B(); suffix_rules; suffix_rules--) { |
20271
|
0 |
0 |
data.next_str(suffix_rule); |
20276
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
|
0 |
0 |
for (unsigned words = data.next_1B(); words; words--) { |
20277
|
0 |
0 |
info.words.emplace_back(); |
20278
|
0 |
0 |
data.next_str(info.words.back()); |
20280
|
0 |
0 |
if (info.words.empty()) return nullptr; |
20283
|
0 |
0 |
if (!suffix_rule.empty()) |
20284
|
0 |
0 |
for (suffix_rule.pop_back(); !suffix_rule.empty(); suffix_rule.pop_back()) |
20286
|
0 |
0 |
} |
20291
|
1 |
0 |
return data.is_end() ? splitter.release() : nullptr; |
20339
|
0 |
0 |
for (auto&& sentence : data) |
20340
|
0 |
0 |
for (auto&& multiword : sentence.multiword_tokens) { |
20343
|
0 |
0 |
for (int i = multiword.id_first; i <= multiword.id_last; i++) |
20344
|
0 |
0 |
utf8::map(unicode::lowercase, sentence.words[i].form, (lc_words.emplace_back(), lc_words.back())); |
20346
|
0 |
0 |
auto& info = full_rules[lc_form]; |
20347
|
0 |
0 |
if (info.words.empty()) |
20350
|
0 |
0 |
if (!info.count) full_rules.erase(lc_form); |
20354
|
0 |
0 |
for (auto&& sentence : data) |
20355
|
0 |
0 |
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
20356
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
20363
|
0 |
0 |
if (it != full_rules.end()) |
20364
|
0 |
0 |
if (!--it->second.count) |
20369
|
0 |
0 |
for (auto&& full_rule : full_rules) { |
20371
|
0 |
0 |
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
|
0 |
0 |
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
|
0 |
0 |
while (prefix_match < full_rule.first.size() && prefix_match < full_rule.second.words[0].size()) prefix_match++; |
20372
|
0 |
0 |
for (; prefix_match; prefix_match--) |
20373
|
0 |
0 |
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
|
0 |
0 |
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
|
0 |
0 |
if (((unsigned char)full_rule.first[prefix_match]) < 0x80 || ((unsigned char)full_rule.first[prefix_match]) >= 0xC0) { |
20374
|
0 |
0 |
lc_form.assign(full_rule.first, prefix_match, string::npos); |
20376
|
0 |
0 |
lc_words[0].erase(0, prefix_match); |
20378
|
0 |
0 |
auto& info = suffix_rules[lc_form]; |
20379
|
0 |
0 |
if (info.words.empty()) |
20382
|
0 |
0 |
if (!info.count) suffix_rules.erase(lc_form); |
20387
|
0 |
0 |
for (auto&& sentence : data) |
20388
|
0 |
0 |
for (size_t i = 1, j = 0; i < sentence.words.size(); i++) { |
20389
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
|
0 |
0 |
if (j < sentence.multiword_tokens.size() && sentence.multiword_tokens[j].id_first == int(i)) { |
20395
|
0 |
0 |
while (lc_form.size() > 1) { |
20396
|
0 |
0 |
lc_form.erase(0, 1); |
20398
|
0 |
0 |
if (it != suffix_rules.end()) { |
20399
|
0 |
0 |
if (it->second.count <= 10) |
20408
|
0 |
0 |
binary_encoder enc; |
20410
|
0 |
0 |
for (auto&& full_rule : full_rules) { |
20411
|
0 |
0 |
enc.add_str(full_rule.first); |
20412
|
0 |
0 |
enc.add_1B(full_rule.second.words.size()); |
20413
|
0 |
0 |
for (auto& word : full_rule.second.words) |
20414
|
0 |
0 |
enc.add_str(word); |
20417
|
0 |
0 |
for (auto&& suffix_rule : suffix_rules) { |
20418
|
0 |
0 |
enc.add_str(suffix_rule.first); |
20419
|
0 |
0 |
enc.add_1B(suffix_rule.second.words.size()); |
20420
|
0 |
0 |
for (auto& word : suffix_rule.second.words) |
20421
|
0 |
0 |
enc.add_str(word); |
20425
|
0 |
0 |
os.put(multiword_splitter::VERSION_LATEST); |
20426
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
|
0 |
0 |
if (!compressor::save(os, enc)) return error.assign("Cannot encode multiword_splitter!"), false; |
20536
|
0 |
0 |
stringstream os_buffer; |
20537
|
0 |
0 |
os_buffer.put(method.size()); |
20538
|
0 |
0 |
os_buffer.write(method.c_str(), method.size()); |
20541
|
0 |
0 |
if (method == "morphodita_parsito") { |
20542
|
0 |
0 |
if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error)) |
|
0 |
0 |
if (!trainer_morphodita_parsito::train(training, heldout, tokenizer, tagger, parser, os_buffer, error)) |
20545
|
0 |
0 |
error.assign("Unknown UDPipe method '").append(method).append("'!"); |
|
0 |
0 |
error.assign("Unknown UDPipe method '").append(method).append("'!"); |
20547
|
0 |
0 |
} |
|
0 |
0 |
} |
20553
|
0 |
0 |
os << os_buffer.rdbuf(); |
20571
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
|
0 |
0 |
for (unsigned i = 0; i < 10; i++) |
20595
|
0 |
0 |
enc.add_1B(maps.size()); |
20596
|
0 |
0 |
for (auto&& map : maps) |
20597
|
0 |
0 |
map.save(enc); |
20599
|
0 |
0 |
return compressor::save(os, enc); |
20622
|
0 |
0 |
for (auto&& description : ElementaryFeatures::descriptions) |
20623
|
0 |
0 |
if (!elementary_map.emplace(description.name, description).second) |
20624
|
0 |
0 |
training_failure("Repeated elementary feature with name " << description.name << '!'); |
20628
|
0 |
0 |
while (getline(is, line)) { |
|
0 |
0 |
while (getline(is, line)) { |
20629
|
0 |
0 |
split(line, ',', tokens); |
20630
|
0 |
0 |
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
|
0 |
0 |
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
|
0 |
0 |
if (tokens.empty()) training_failure("Feature sequence cannot be empty!"); |
20633
|
0 |
0 |
sequences.emplace_back(); |
20634
|
0 |
0 |
for (auto&& token : tokens) { |
20636
|
0 |
0 |
split(token, ' ', parts); |
20637
|
0 |
0 |
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
|
0 |
0 |
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
|
0 |
0 |
if (parts.size() != 2) training_failure("Cannot parse feature sequence element '" << token << "'!"); |
20639
|
0 |
0 |
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
|
0 |
0 |
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
|
0 |
0 |
if (it == elementary_map.end()) training_failure("Unknown elementary feature '" << parts[0] << "' used in feature sequence '" << token << "'!"); |
20642
|
0 |
0 |
int sequence_index = parse_int(parts[1].c_str(), "sequence_index"); |
20643
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == DYNAMIC && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of dynamic elementary feature '" << desc.name << "'!"); |
20644
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
|
0 |
0 |
if (desc.type == PER_TAG && (sequence_index > 0 || sequence_index <= -window_size)) training_failure("Wrong sequence index " << sequence_index << " of per-tag elementary feature '" << desc.name << "'!"); |
20645
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
|
0 |
0 |
if (desc.range == ONLY_CURRENT && sequence_index != 0) training_failure("Nonzero sequence index " << sequence_index << " of elementary feature '" << desc.name << "' requiring zero offset!"); |
20647
|
0 |
0 |
sequences.back().elements.emplace_back(it->second.type, it->second.index, sequence_index); |
20648
|
0 |
0 |
if (desc.type == DYNAMIC) sequences.back().dependant_range = max(sequences.back().dependant_range, window_size + 1); |
20649
|
0 |
0 |
if (desc.type == PER_TAG) sequences.back().dependant_range = max(sequences.back().dependant_range, 1 - sequence_index); |
20652
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
|
0 |
0 |
if (contains_only_current && sequences.back().dependant_range > 1) training_failure("Feature sequence '" << line << "' contains both a non-local elementary feature and exclusively-local elementary feature!"); |
20656
|
0 |
0 |
scores.resize(sequences.size()); |
20661
|
0 |
0 |
if (!elementary.save(os)) return false; |
20664
|
0 |
0 |
enc.add_1B(sequences.size()); |
20665
|
0 |
0 |
for (auto&& sequence : sequences) { |
20667
|
0 |
0 |
enc.add_1B(sequence.elements.size()); |
20668
|
0 |
0 |
for (auto&& element : sequence.elements) { |
20675
|
0 |
0 |
enc.add_1B(scores.size()); |
20676
|
0 |
0 |
for (auto&& score : scores) |
20677
|
0 |
0 |
score.save(enc); |
20679
|
0 |
0 |
return compressor::save(os, enc); |
20700
|
0 |
0 |
class training_elementary_feature_map { |
|
0 |
0 |
class training_elementary_feature_map { |
20734
|
0 |
0 |
return it != map.end() ? it->second.alpha : 0; |
20777
|
0 |
0 |
for (unsigned i = 0; i < map_indices.size(); i++) { |
20778
|
0 |
0 |
for (auto&& element : features.sequences[i].elements) |
20779
|
0 |
0 |
for (auto&& description : decltype(features.elementary)::descriptions) |
20780
|
0 |
0 |
if (element.type == description.type && element.elementary_index == description.index) |
|
0 |
0 |
if (element.type == description.type && element.elementary_index == description.index) |
20781
|
0 |
0 |
map_indices[i].emplace_back(description.map_index); |
20783
|
0 |
0 |
assert(map_indices[i].size() == features.sequences[i].elements.size()); |
20787
|
0 |
0 |
vector> counts(elementary.maps.size()); |
20789
|
0 |
0 |
for (unsigned i = 0; i < features.sequences.size(); i++) |
20790
|
0 |
0 |
for (auto&& element : features.scores[i].map) |
20791
|
0 |
0 |
if (element.second.gamma) { |
20793
|
0 |
0 |
for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size())) |
|
0 |
0 |
for (const char* key = element.first.c_str(); key != element.first.c_str() + element.first.size(); assert(key <= element.first.c_str() + element.first.size())) |
20794
|
0 |
0 |
elementary_ids.emplace_back(vli::decode(key)); |
20796
|
0 |
0 |
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
20797
|
0 |
0 |
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
20798
|
0 |
0 |
if (map_indices[i][j] < 0) continue; |
20799
|
0 |
0 |
if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1); |
|
0 |
0 |
if (elementary_ids[j] >= counts[map_indices[i][j]].size()) counts[map_indices[i][j]].resize(elementary_ids[j] + 1); |
20805
|
0 |
0 |
for (auto&& count : counts) { |
20806
|
0 |
0 |
if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1); |
|
0 |
0 |
if (elementary_feature_empty >= count.size()) count.resize(elementary_feature_empty + 1); |
20809
|
0 |
0 |
for (elementary_feature_value i = 0; i < count.size(); i++) count[i].ori = i; |
20814
|
0 |
0 |
vector> elementary_ids_map(counts.size()); |
20815
|
0 |
0 |
for (unsigned i = 0; i < counts.size(); i++) { |
20816
|
0 |
0 |
elementary_ids_map[i].resize(counts[i].size()); |
20817
|
0 |
0 |
for (elementary_feature_value j = 0; j < counts[i].size(); j++) |
20818
|
0 |
0 |
elementary_ids_map[i][counts[i][j].ori] = counts[i][j].count ? j : elementary_feature_unknown; |
20823
|
0 |
0 |
for (unsigned i = 0; i < elementary.maps.size(); i++) { |
20825
|
0 |
0 |
for (auto&& element : elementary.maps[i].map) |
20826
|
0 |
0 |
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
|
0 |
0 |
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
|
0 |
0 |
if (element.second < elementary_ids_map[i].size() && elementary_ids_map[i][element.second] != elementary_feature_unknown) |
20829
|
0 |
0 |
optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) { |
|
0 |
0 |
optimized_elementary.maps.emplace_back(persistent_unordered_map(mapped_ids, 1, [](binary_encoder& enc, int id) { |
20835
|
0 |
0 |
optimized_features.sequences = features.sequences; |
20838
|
0 |
0 |
for (unsigned i = 0; i < features.sequences.size(); i++) { |
20840
|
0 |
0 |
for (auto&& element : features.scores[i].map) |
20841
|
0 |
0 |
if (element.second.gamma) { |
20843
|
0 |
0 |
for (const char* key = element.first.c_str(); key < element.first.c_str() + element.first.size(); ) |
20844
|
0 |
0 |
elementary_ids.emplace_back(vli::decode(key)); |
20846
|
0 |
0 |
assert(elementary_ids.size() == features.sequences[i].elements.size()); |
20847
|
0 |
0 |
for (unsigned j = 0; j < elementary_ids.size(); j++) { |
20848
|
0 |
0 |
if (map_indices[i][j] < 0) continue; |
20849
|
0 |
0 |
assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown); |
|
0 |
0 |
assert(elementary_ids[j] < elementary_ids_map[map_indices[i][j]].size() && elementary_ids_map[map_indices[i][j]][elementary_ids[j]] != elementary_feature_unknown); |
20853
|
0 |
0 |
key_buffer.resize(elementary_ids.size() * vli::max_length()); |
20855
|
0 |
0 |
for (unsigned j = 0; j < elementary_ids.size(); j++) |
20861
|
0 |
0 |
optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
|
0 |
0 |
optimized_features.scores.emplace_back(persistent_unordered_map(updated_map, 1, [](binary_encoder& enc, const training_feature_sequence_map::info& info) { |
20862
|
0 |
0 |
assert(feature_sequence_score(info.gamma) == info.gamma); |
20928
|
0 |
0 |
if (!d) training_failure("Cannot load dictionary!"); |
|
0 |
0 |
if (!d) training_failure("Cannot load dictionary!"); |
|
0 |
0 |
if (!d) training_failure("Cannot load dictionary!"); |
20930
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
|
0 |
0 |
if (!in_morpho_dict.seekg(0, istream::beg)) training_failure("Cannot seek in dictionary file to the beginning!"); |
20935
|
0 |
0 |
load_data(in_train, *d, use_guesser, train_data, true); |
20938
|
0 |
0 |
if (in_heldout) { |
20941
|
0 |
0 |
load_data(in_heldout, *d, use_guesser, heldout_data, false); |
20946
|
0 |
0 |
out_tagger << in_morpho_dict.rdbuf(); |
20947
|
0 |
0 |
out_tagger.put(use_guesser); |
20950
|
0 |
0 |
TaggerTrainer::train(decoding_order, window_size, iterations, train_data, heldout_data, early_stopping, prune_features, in_feature_templates, out_tagger); |
20961
|
0 |
0 |
sentences.emplace_back(); |
20962
|
0 |
0 |
while (getline(is, line)) { |
|
0 |
0 |
while (getline(is, line)) { |
20963
|
0 |
0 |
if (line.empty()) { |
20964
|
0 |
0 |
if (!sentences.back().words.empty()) |
20965
|
0 |
0 |
sentences.emplace_back(); |
20969
|
0 |
0 |
split(line, '\t', tokens); |
20970
|
0 |
0 |
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
|
0 |
0 |
if (tokens.size() != 3) training_failure("The tagger data line '" << line << "' does not contain three columns!"); |
20975
|
0 |
0 |
s.words.emplace_back(tokens[0]); |
20976
|
0 |
0 |
s.gold.emplace_back(tokens[1], tokens[2]); |
20977
|
0 |
0 |
s.gold_index.emplace_back(-1); |
20980
|
0 |
0 |
s.analyses.emplace_back(); |
20981
|
0 |
0 |
d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back()); |
|
0 |
0 |
d.analyze(tokens[0], use_guesser ? morpho::GUESSER : morpho::NO_GUESSER, s.analyses.back()); |
20984
|
0 |
0 |
for (size_t i = 0; i < s.analyses.back().size(); i++) |
20985
|
0 |
0 |
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
|
0 |
0 |
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
|
0 |
0 |
if (s.analyses.back()[i].lemma == s.gold.back().lemma && s.analyses.back()[i].tag == s.gold.back().tag) { |
20990
|
0 |
0 |
if (s.gold_index.back() == -1 && add_gold) { |
|
0 |
0 |
if (s.gold_index.back() == -1 && add_gold) { |
|
0 |
0 |
if (s.gold_index.back() == -1 && add_gold) { |
20992
|
0 |
0 |
s.analyses.back().emplace_back(tokens[1], tokens[2]); |
20995
|
0 |
0 |
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
|
0 |
0 |
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
|
0 |
0 |
if (!sentences.empty() && sentences.back().words.empty()) sentences.pop_back(); |
20998
|
0 |
0 |
for (auto&& sentence : sentences) |
20999
|
0 |
0 |
for (auto&& word : sentence.words) |
21000
|
0 |
0 |
sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word))); |
|
0 |
0 |
sentence.forms.emplace_back(string_piece(word.c_str(), d.raw_form_len(word))); |
21040
|
0 |
0 |
features.parse(window_size, in_feature_templates); |
21043
|
0 |
0 |
train_viterbi(decoding_order, window_size, iterations, train, heldout, early_stopping, prune_features, features); |
21048
|
0 |
0 |
optimizer::optimize(features, optimized_features); |
21049
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
|
0 |
0 |
if (!optimized_features.save(out_tagger)) training_failure("Cannot save feature sequences!"); |
21058
|
0 |
0 |
typename decltype(decoder)::cache decoder_cache(decoder); |
21060
|
0 |
0 |
typename FeatureSequences::cache feature_sequences_cache(features); |
21064
|
0 |
0 |
vector window(window_size); |
21067
|
0 |
0 |
if (prune_features) |
21068
|
0 |
0 |
for (unsigned s = 0; s < train.size(); s++) { |
21070
|
0 |
0 |
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
21071
|
0 |
0 |
for (int i = 0; i < int(sentence.forms.size()); i++) { |
21073
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
21076
|
0 |
0 |
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
21078
|
0 |
0 |
for (unsigned f = 0; f < features.scores.size(); f++) |
21079
|
0 |
0 |
if (!gold_feature_sequences_keys[f].empty()) |
21085
|
0 |
0 |
for (int i = 0; i < iterations; i++) { |
21088
|
0 |
0 |
cerr << "Iteration " << i + 1 << ": "; |
|
0 |
0 |
cerr << "Iteration " << i + 1 << ": "; |
21091
|
0 |
0 |
for (unsigned s = 0; s < train.size(); s++) { |
21095
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size()); |
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(2 * sentence.forms.size()); |
21096
|
0 |
0 |
decoder.tag(sentence.forms, sentence.analyses, decoder_cache, tags); |
21099
|
0 |
0 |
features.initialize_sentence(sentence.forms, sentence.analyses, feature_sequences_cache); |
21100
|
0 |
0 |
for (int i = 0; i < int(sentence.forms.size()); i++) { |
21105
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j]; |
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = tags[i - j]; |
21107
|
0 |
0 |
features.feature_keys(i, window.data(), 0, decoded_dynamic_features, decoded_feature_sequences_keys, feature_sequences_cache); |
21109
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
|
0 |
0 |
for (int j = 0; j < window_size && i - j >= 0; j++) window[j] = sentence.gold_index[i - j]; |
21111
|
0 |
0 |
features.feature_keys(i, window.data(), 0, gold_dynamic_features, gold_feature_sequences_keys, feature_sequences_cache); |
21113
|
0 |
0 |
for (unsigned f = 0; f < features.scores.size(); f++) { |
21114
|
0 |
0 |
if (decoded_feature_sequences_keys[f] != gold_feature_sequences_keys[f]) { |
21115
|
0 |
0 |
if (!decoded_feature_sequences_keys[f].empty()) { |
21117
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(decoded_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
21118
|
0 |
0 |
if (it != features.scores[f].map.end()) { |
21126
|
0 |
0 |
if (!gold_feature_sequences_keys[f].empty()) { |
21128
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
|
0 |
0 |
if (it == features.scores[f].map.end() && !prune_features) it = features.scores[f].map.emplace(gold_feature_sequences_keys[f], typename decltype(features.scores[f].map)::mapped_type()).first; |
21129
|
0 |
0 |
if (it != features.scores[f].map.end()) { |
21142
|
0 |
0 |
for (auto&& score : features.scores) |
21143
|
0 |
0 |
for (auto&& element : score.map) { |
21150
|
0 |
0 |
if (!heldout.empty()) { |
21156
|
0 |
0 |
optimizer::optimize(features, frozen_features); |
21158
|
0 |
0 |
typename decltype(frozen_decoder)::cache frozen_decoder_cache(frozen_decoder); |
21160
|
0 |
0 |
for (auto&& sentence : heldout) { |
21161
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2); |
|
0 |
0 |
if (tags.size() < sentence.forms.size()) tags.resize(sentence.forms.size() * 2); |
21162
|
0 |
0 |
frozen_decoder.tag(sentence.forms, sentence.analyses, frozen_decoder_cache, tags); |
21164
|
0 |
0 |
for (unsigned i = 0; i < sentence.forms.size(); i++) { |
21167
|
0 |
0 |
heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
|
0 |
0 |
heldout_correct[BOTH] += sentence.gold[i].tag == sentence.analyses[i][tags[i]].tag && sentence.gold[i].lemma == sentence.analyses[i][tags[i]].lemma; |
21172
|
0 |
0 |
if (early_stopping && heldout_correct[BOTH] > best_correct) { |
|
0 |
0 |
if (early_stopping && heldout_correct[BOTH] > best_correct) { |
21175
|
0 |
0 |
best_features = features; |
21178
|
0 |
0 |
cerr << ", heldout accuracy " << fixed << setprecision(2) |
21186
|
0 |
0 |
if (early_stopping && best_iteration >= 0) { |
21187
|
0 |
0 |
cerr << "Chosen tagger model from iteration " << best_iteration + 1 << endl; |
21188
|
0 |
0 |
features = best_features; |
21286
|
0 |
0 |
for (auto&& sentence : training) |
21287
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
21288
|
0 |
0 |
if (!can_combine_tag(sentence.words[i], error)) |
21290
|
0 |
0 |
for (auto&& sentence : heldout) |
21291
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
21292
|
0 |
0 |
if (!can_combine_tag(sentence.words[i], error)) |
21295
|
0 |
0 |
if (!train_tokenizer(training, heldout, tokenizer, os, error)) return false; |
21298
|
0 |
0 |
ostringstream os_tagger; |
21299
|
0 |
0 |
if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false; |
|
0 |
0 |
if (!train_tagger(training, heldout, tagger, os_tagger, error)) return false; |
21301
|
0 |
0 |
os.write(tagger_model.data(), tagger_model.size()); |
21303
|
0 |
0 |
if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false; |
|
0 |
0 |
if (!train_parser(training, heldout, parser, tagger_model, os, error)) return false; |
21310
|
0 |
0 |
if (options == NONE) { |
21315
|
0 |
0 |
if (!named_values::parse(options, tokenizer, error)) return false; |
|
0 |
0 |
if (!named_values::parse(options, tokenizer, error)) return false; |
21316
|
0 |
0 |
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
|
0 |
0 |
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
|
0 |
0 |
int run = 0; if (!option_int(tokenizer, "run", run, error)) return false; |
21318
|
0 |
0 |
if (tokenizer.count("from_model")) { |
|
0 |
0 |
if (tokenizer.count("from_model")) { |
21321
|
0 |
0 |
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
|
0 |
0 |
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
|
0 |
0 |
if (!load_model(tokenizer["from_model"], TOKENIZER_MODEL, tokenizer_data)) |
21322
|
0 |
0 |
return error.assign("Cannot load model from which the tokenizer should be used!"), false; |
21325
|
0 |
0 |
os.write(tokenizer_data.str, tokenizer_data.len); |
21327
|
0 |
0 |
os.put(1); |
21328
|
0 |
0 |
const string& model = option_str(tokenizer, "model"); |
|
0 |
0 |
const string& model = option_str(tokenizer, "model"); |
21331
|
0 |
0 |
if (model == "generic") { |
21332
|
0 |
0 |
os.put(morphodita::tokenizer_id::GENERIC); |
21334
|
0 |
0 |
} else if (model.empty() || model == "gru") { |
|
0 |
0 |
} else if (model.empty() || model == "gru") { |
|
0 |
0 |
} else if (model.empty() || model == "gru") { |
21337
|
0 |
0 |
if (tokenizer.count("detokenize")) { |
|
0 |
0 |
if (tokenizer.count("detokenize")) { |
21338
|
0 |
0 |
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
|
0 |
0 |
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
|
0 |
0 |
detokenizer.reset(new udpipe::detokenizer(tokenizer["detokenize"])); |
21339
|
0 |
0 |
if (!detokenizer) return error.assign("Cannot create detokenizer!"), false; |
|
0 |
0 |
if (!detokenizer) return error.assign("Cannot create detokenizer!"), false; |
21345
|
0 |
0 |
for (size_t training_sentence = 0; training_sentence < training.size(); training_sentence++) { |
21346
|
0 |
0 |
sentence s = training[training_sentence]; |
21347
|
0 |
0 |
if (detokenizer) detokenizer->detokenize(s); |
|
0 |
0 |
if (detokenizer) detokenizer->detokenize(s); |
21349
|
0 |
0 |
auto& sentence = (sentences.emplace_back(), sentences.back()); |
21351
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
21352
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
21353
|
0 |
0 |
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
21355
|
0 |
0 |
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
21356
|
0 |
0 |
for (auto&& chr : unilib::utf8::decoder(tok.form)) { |
21357
|
0 |
0 |
sentence.sentence.push_back(chr); |
21358
|
0 |
0 |
if (unilib::unicode::category(chr) & unilib::unicode::Zs) spaces_in_training = true; |
21362
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
21364
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
21367
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
|
0 |
0 |
if (training_sentence + 1 < training.size() && (training[training_sentence + 1].get_new_doc() || training[training_sentence + 1].get_new_par())) |
21374
|
0 |
0 |
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
|
0 |
0 |
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
|
0 |
0 |
bool detokenize_handout = true; if (!option_bool(tokenizer, "detokenize_handout", detokenize_handout, error)) return false; |
21375
|
0 |
0 |
for (size_t heldout_sentence = 0; heldout_sentence < heldout.size(); heldout_sentence++) { |
21376
|
0 |
0 |
sentence s = heldout[heldout_sentence]; |
21377
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
|
0 |
0 |
if (detokenizer && detokenize_handout) detokenizer->detokenize(s); |
21379
|
0 |
0 |
auto& sentence = (heldout_sentences.emplace_back(), heldout_sentences.back()); |
21381
|
0 |
0 |
for (size_t i = 1, j = 0; i < s.words.size(); i++) { |
21382
|
0 |
0 |
const token& tok = j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i) ? |
21383
|
0 |
0 |
(const token&)s.multiword_tokens[j] : (const token&)s.words[i]; |
21385
|
0 |
0 |
sentence.tokens.emplace_back(sentence.sentence.size(), 0); |
21386
|
0 |
0 |
for (auto&& chr : unilib::utf8::decoder(tok.form)) |
21387
|
0 |
0 |
sentence.sentence.push_back(chr); |
21390
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
|
0 |
0 |
if (tok.get_space_after()) sentence.sentence.push_back(' '); |
21392
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
|
0 |
0 |
if (j < s.multiword_tokens.size() && s.multiword_tokens[j].id_first == int(i)) |
21395
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
|
0 |
0 |
if (heldout_sentence + 1 < heldout.size() && (heldout[heldout_sentence + 1].get_new_doc() || heldout[heldout_sentence + 1].get_new_par())) |
21400
|
0 |
0 |
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
|
0 |
0 |
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
|
0 |
0 |
bool tokenize_url = true; if (!option_bool(tokenizer, "tokenize_url", tokenize_url, error)) return false; |
21401
|
0 |
0 |
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
|
0 |
0 |
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
|
0 |
0 |
int segment_size = 50; if (!option_int(tokenizer, "segment_size", segment_size, error)) return false; |
21402
|
0 |
0 |
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
|
0 |
0 |
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
|
0 |
0 |
bool allow_spaces = spaces_in_training; if (!option_bool(tokenizer, "allow_spaces", allow_spaces, error)) return false; |
21403
|
0 |
0 |
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
|
0 |
0 |
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
|
0 |
0 |
int dimension = 24; if (!option_int(tokenizer, "dimension", dimension, error)) return false; |
21404
|
0 |
0 |
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
|
0 |
0 |
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
|
0 |
0 |
int epochs = 100; if (!option_int(tokenizer, "epochs", epochs, error)) return false; |
21405
|
0 |
0 |
int batch_size = run <= 1 ? 50 : 50 + 50 * hyperparameter_integer(run, 1, 0, 1); |
21406
|
0 |
0 |
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
|
0 |
0 |
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
|
0 |
0 |
if (!option_int(tokenizer, "batch_size", batch_size, error)) return false; |
21407
|
0 |
0 |
double learning_rate = run <= 1 ? 0.005 : hyperparameter_logarithmic(run, 2, 0.0005, 0.01); |
21408
|
0 |
0 |
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
|
0 |
0 |
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
|
0 |
0 |
if (!option_double(tokenizer, "learning_rate", learning_rate, error)) return false; |
21409
|
0 |
0 |
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
|
0 |
0 |
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
|
0 |
0 |
double learning_rate_final = 0.0; if (!option_double(tokenizer, "learning_rate_final", learning_rate_final, error)) return false; |
21410
|
0 |
0 |
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
|
0 |
0 |
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
|
0 |
0 |
double dropout = 0.1; if (!option_double(tokenizer, "dropout", dropout, error)) return false; |
21411
|
0 |
0 |
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
|
0 |
0 |
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
|
0 |
0 |
double initialization_range = 0.5; if (!option_double(tokenizer, "initialization_range", initialization_range, error)) return false; |
21412
|
0 |
0 |
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
|
0 |
0 |
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
|
0 |
0 |
bool early_stopping = !heldout_sentences.empty(); if (!option_bool(tokenizer, "early_stopping", early_stopping, error)) return false; |
21414
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", batch_size=" << batch_size |
21417
|
0 |
0 |
cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0) |
|
0 |
0 |
cerr << "Training tokenizer with the following options: " << "tokenize_url=" << (tokenize_url ? 1 : 0) |
21418
|
0 |
0 |
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
|
0 |
0 |
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
|
0 |
0 |
<< ", allow_spaces=" << (allow_spaces ? 1 : 0) << ", dimension=" << dimension << endl |
21419
|
0 |
0 |
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
|
0 |
0 |
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
|
0 |
0 |
<< " epochs=" << epochs << ", batch_size=" << batch_size << ", segment_size=" << segment_size |
21421
|
0 |
0 |
<< " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
0 |
0 |
<< " dropout=" << dropout << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
21424
|
0 |
0 |
os.put(morphodita::tokenizer_ids::GRU); |
21425
|
0 |
0 |
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
|
0 |
0 |
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
|
0 |
0 |
if (!morphodita::gru_tokenizer_trainer::train(tokenize_url ? morphodita::gru_tokenizer_trainer::URL_EMAIL_LATEST : 0, |
21431
|
0 |
0 |
return error.assign("Unknown tokenizer model '").append(model).append("'!"), false; |
|
0 |
0 |
return error.assign("Unknown tokenizer model '").append(model).append("'!"), false; |
21435
|
0 |
0 |
if (!multiword_splitter_trainer::train(training, os, error)) return false; |
|
0 |
0 |
if (!multiword_splitter_trainer::train(training, os, error)) return false; |
21444
|
0 |
0 |
if (options == NONE) { |
21449
|
0 |
0 |
if (!named_values::parse(options, tagger, error)) return false; |
|
0 |
0 |
if (!named_values::parse(options, tagger, error)) return false; |
21451
|
0 |
0 |
if (tagger.count("from_model")) { |
|
0 |
0 |
if (tagger.count("from_model")) { |
21454
|
0 |
0 |
string model_name = "from_model"; |
21456
|
0 |
0 |
do { |
21457
|
0 |
0 |
taggers_data.emplace_back(); |
21458
|
0 |
0 |
if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back())) |
|
0 |
0 |
if (!load_model(tagger[model_name], TAGGER_MODEL, taggers_data.back())) |
21459
|
0 |
0 |
return error.assign("Cannot load model from which the tagger should be used!"), false; |
21460
|
0 |
0 |
if (taggers_data.back().str[0]) { |
21463
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
|
0 |
0 |
vector overrides = {"lemma", "xpostag", "feats"}; |
21464
|
0 |
0 |
for (size_t i = 0; i < overrides.size(); i++) { |
21465
|
0 |
0 |
string override_name = "from_model_" + overrides[i]; |
21467
|
0 |
0 |
if (!option_int(tagger, override_name, override_value, error, model_index)) return false; |
|
0 |
0 |
if (!option_int(tagger, override_name, override_value, error, model_index)) return false; |
21468
|
0 |
0 |
if (override_value >= 0) |
21474
|
0 |
0 |
model_name = "from_model_" + to_string(1 + ++model_index); |
21476
|
0 |
0 |
if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
0 |
0 |
if (taggers_total < 0 || taggers_total > 4) return error.assign("Cannot create more than four tagger models!"), false; |
21479
|
0 |
0 |
os.put(taggers_total); |
21480
|
0 |
0 |
for (auto&& tagger_data : taggers_data) |
21481
|
0 |
0 |
os.write(tagger_data.str + 1, tagger_data.len - 1); |
21484
|
0 |
0 |
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
|
0 |
0 |
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
|
0 |
0 |
int models = 1; if (!option_int(tagger, "models", models, error)) return false; |
21485
|
0 |
0 |
if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false; |
|
0 |
0 |
if (models <= 0) return error.assign("Number of tagger models cannot be negative or zero!"), false; |
21486
|
0 |
0 |
if (models > 4) return error.assign("Cannot create more than four tagger models!"), false; |
|
0 |
0 |
if (models > 4) return error.assign("Cannot create more than four tagger models!"), false; |
21488
|
0 |
0 |
os.put(models); |
21489
|
0 |
0 |
for (int model = 0; model < models; model++) |
21490
|
0 |
0 |
if (!train_tagger_model(training, heldout, model, models, tagger, os, error)) |
|
0 |
0 |
if (!train_tagger_model(training, heldout, model, models, tagger, os, error)) |
21500
|
0 |
0 |
if (options == NONE) { |
21505
|
0 |
0 |
if (!named_values::parse(options, parser, error)) return false; |
|
0 |
0 |
if (!named_values::parse(options, parser, error)) return false; |
21506
|
0 |
0 |
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
|
0 |
0 |
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
|
0 |
0 |
int run = 0; if (!option_int(parser, "run", run, error)) return false; |
21508
|
0 |
0 |
if (parser.count("from_model")) { |
|
0 |
0 |
if (parser.count("from_model")) { |
21511
|
0 |
0 |
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
|
0 |
0 |
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
|
0 |
0 |
if (!load_model(parser["from_model"], PARSER_MODEL, parser_data)) |
21512
|
0 |
0 |
return error.assign("Cannot load model from which the parser should be used!"), false; |
21515
|
0 |
0 |
os.write(parser_data.str, parser_data.len); |
21517
|
0 |
0 |
os.put(1); |
21520
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
|
0 |
0 |
string transition_system = parser.count("transition_system") ? parser["transition_system"] : "projective"; |
21521
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
|
0 |
0 |
string transition_oracle = parser.count("transition_oracle") ? parser["transition_oracle"] : |
21524
|
0 |
0 |
"static"; |
|
0 |
0 |
"static"; |
|
0 |
0 |
"static"; |
|
0 |
0 |
"static"; |
21526
|
0 |
0 |
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
|
0 |
0 |
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
|
0 |
0 |
int embedding_upostag = 20; if (!option_int(parser, "embedding_upostag", embedding_upostag, error)) return false; |
21527
|
0 |
0 |
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
|
0 |
0 |
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
|
0 |
0 |
int embedding_feats = 20; if (!option_int(parser, "embedding_feats", embedding_feats, error)) return false; |
21528
|
0 |
0 |
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
|
0 |
0 |
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
|
0 |
0 |
int embedding_xpostag = 0; if (!option_int(parser, "embedding_xpostag", embedding_xpostag, error)) return false; |
21529
|
0 |
0 |
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
|
0 |
0 |
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
|
0 |
0 |
int embedding_form = 50; if (!option_int(parser, "embedding_form", embedding_form, error)) return false; |
21530
|
0 |
0 |
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
|
0 |
0 |
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
|
0 |
0 |
int embedding_form_mincount = 2; if (!option_int(parser, "embedding_form_mincount", embedding_form_mincount, error)) return false; |
21531
|
0 |
0 |
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
|
0 |
0 |
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
|
0 |
0 |
int embedding_lemma = 0; if (!option_int(parser, "embedding_lemma", embedding_lemma, error)) return false; |
21532
|
0 |
0 |
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
|
0 |
0 |
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
|
0 |
0 |
int embedding_lemma_mincount = 2; if (!option_int(parser, "embedding_lemma_mincount", embedding_lemma_mincount, error)) return false; |
21533
|
0 |
0 |
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
|
0 |
0 |
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
|
0 |
0 |
int embedding_deprel = 20; if (!option_int(parser, "embedding_deprel", embedding_deprel, error)) return false; |
21535
|
0 |
0 |
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
|
0 |
0 |
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
|
0 |
0 |
if (embedding_upostag) embeddings.append("universal_tag ").append(to_string(embedding_upostag)).append(" 1\n"); |
21536
|
0 |
0 |
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
|
0 |
0 |
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
|
0 |
0 |
if (embedding_feats) embeddings.append("feats ").append(to_string(embedding_feats)).append(" 1\n"); |
21537
|
0 |
0 |
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
|
0 |
0 |
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
|
0 |
0 |
if (embedding_xpostag) embeddings.append("tag ").append(to_string(embedding_xpostag)).append(" 1\n"); |
21538
|
0 |
0 |
if (embedding_form) { |
21539
|
0 |
0 |
embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount)); |
|
0 |
0 |
embeddings.append("form ").append(to_string(embedding_form)).append(" ").append(to_string(embedding_form_mincount)); |
21540
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_form_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_form_file")); |
21541
|
0 |
0 |
embeddings.push_back('\n'); |
21543
|
0 |
0 |
if (embedding_lemma) { |
21544
|
0 |
0 |
embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount)); |
|
0 |
0 |
embeddings.append("lemma ").append(to_string(embedding_lemma)).append(" ").append(to_string(embedding_lemma_mincount)); |
21545
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
|
0 |
0 |
if (!option_str(parser, "embedding_lemma_file").empty()) embeddings.append(" ").append(option_str(parser, "embedding_lemma_file")); |
21546
|
0 |
0 |
embeddings.push_back('\n'); |
21548
|
0 |
0 |
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
|
0 |
0 |
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
|
0 |
0 |
if (embedding_deprel) embeddings.append("deprel ").append(to_string(embedding_deprel)).append(" 1\n"); |
21550
|
0 |
0 |
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
|
0 |
0 |
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
|
0 |
0 |
bool single_root = true; if (!option_bool(parser, "single_root", single_root, error)) return false; |
21551
|
0 |
0 |
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
|
0 |
0 |
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
|
0 |
0 |
int iterations = 10; if (!option_int(parser, "iterations", iterations, error)) return false; |
21552
|
0 |
0 |
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
|
0 |
0 |
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
|
0 |
0 |
int hidden_layer = 200; if (!option_int(parser, "hidden_layer", hidden_layer, error)) return false; |
21553
|
0 |
0 |
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
|
0 |
0 |
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
|
0 |
0 |
int batch_size = 10; if (!option_int(parser, "batch_size", batch_size, error)) return false; |
21554
|
0 |
0 |
int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2); |
|
0 |
0 |
int structured_interval = run <= 1 ? 8 : hyperparameter_integer(run,1,0,2) == 2 ? 0 : 8 + 2*hyperparameter_integer(run,1,0,2); |
21555
|
0 |
0 |
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
|
0 |
0 |
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
|
0 |
0 |
if (!option_int(parser, "structured_interval", structured_interval, error)) return false; |
21556
|
0 |
0 |
double learning_rate = run <= 1 ? 0.02 : hyperparameter_logarithmic(run, 2, 0.005, 0.04); |
21557
|
0 |
0 |
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
|
0 |
0 |
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
|
0 |
0 |
if (!option_double(parser, "learning_rate", learning_rate, error)) return false; |
21558
|
0 |
0 |
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
|
0 |
0 |
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
|
0 |
0 |
double learning_rate_final = 0.001; if (!option_double(parser, "learning_rate_final", learning_rate_final, error)) return false; |
21559
|
0 |
0 |
double l2 = run <= 1 ? 0.5 : hyperparameter_uniform(run, 3, 0.2, 0.6); |
21560
|
0 |
0 |
if (!option_double(parser, "l2", l2, error)) return false; |
|
0 |
0 |
if (!option_double(parser, "l2", l2, error)) return false; |
|
0 |
0 |
if (!option_double(parser, "l2", l2, error)) return false; |
21561
|
0 |
0 |
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
|
0 |
0 |
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
|
0 |
0 |
bool early_stopping = !heldout.empty(); if (!option_bool(parser, "early_stopping", early_stopping, error)) return false; |
21563
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", structured_interval=" << structured_interval |
21589
|
0 |
0 |
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
|
0 |
0 |
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
|
0 |
0 |
bool use_gold_tags = false; if (!option_bool(parser, "use_gold_tags", use_gold_tags, error)) return false; |
21590
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
|
0 |
0 |
if (!use_gold_tags && !tagger_model.empty() && tagger_model[0]) { |
21591
|
0 |
0 |
stringstream tagger_description; |
21592
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
|
0 |
0 |
tagger_description.put(model_morphodita_parsito::VERSION_LATEST).put(0x7F).put(0x7F).put(0).write(tagger_model.data(), tagger_model.size()).put(0); |
21593
|
0 |
0 |
tagger.reset(model_morphodita_parsito::load(tagger_description)); |
21594
|
0 |
0 |
if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false; |
|
0 |
0 |
if (!tagger) return error.assign("Cannot load the tagger model for parser training data generation!"), false; |
21598
|
0 |
0 |
sentence tagged; |
21600
|
0 |
0 |
for (auto&& sentence : training) { |
21601
|
0 |
0 |
tagged = sentence; |
21602
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
21604
|
0 |
0 |
train_trees.emplace_back(); |
21605
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) { |
21607
|
0 |
0 |
model_normalize_form(tagged.words[i].form, train_trees.back().nodes.back().form); |
21613
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) |
21614
|
0 |
0 |
train_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
21619
|
0 |
0 |
for (auto&& sentence : heldout) { |
21620
|
0 |
0 |
tagged = sentence; |
21621
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
|
0 |
0 |
if (tagger && !tagger->tag(tagged, DEFAULT, error)) return false; |
21623
|
0 |
0 |
heldout_trees.emplace_back(); |
21624
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) { |
21626
|
0 |
0 |
model_normalize_form(tagged.words[i].form, heldout_trees.back().nodes.back().form); |
21632
|
0 |
0 |
for (size_t i = 1; i < tagged.words.size(); i++) |
21633
|
0 |
0 |
heldout_trees.back().set_head(tagged.words[i].id, tagged.words[i].head, tagged.words[i].deprel); |
21637
|
0 |
0 |
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
|
0 |
0 |
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
|
0 |
0 |
<< ", structured_interval=" << structured_interval << ", single_root=" << (single_root ? 1 : 0) << endl |
21638
|
0 |
0 |
<< "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl |
|
0 |
0 |
<< "Parser uses lemmas/upos/xpos/feats: " << (tagger ? "automatically generated by tagger" : "from gold data") << endl |
21639
|
0 |
0 |
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
|
0 |
0 |
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
|
0 |
0 |
<< "Parser embeddings options: upostag=" << embedding_upostag << ", feats=" << embedding_feats << ", xpostag=" << embedding_xpostag |
21640
|
0 |
0 |
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
|
0 |
0 |
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
|
0 |
0 |
<< ", form=" << embedding_form << ", lemma=" << embedding_lemma << ", deprel=" << embedding_deprel << endl |
21641
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
|
0 |
0 |
<< " form mincount=" << embedding_form_mincount << ", precomputed form embeddings=" << (parser["embedding_form_file"].empty() ? "none" : parser["embedding_form_file"]) << endl |
21642
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
|
0 |
0 |
<< " lemma mincount=" << embedding_lemma_mincount << ", precomputed lemma embeddings=" << (parser["embedding_lemma_file"].empty() ? "none" : parser["embedding_lemma_file"]) << endl |
21643
|
0 |
0 |
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
|
0 |
0 |
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
|
0 |
0 |
<< "Parser network options: iterations=" << iterations << ", hidden_layer=" << hidden_layer << ", batch_size=" << batch_size << "," << endl |
21645
|
0 |
0 |
<< ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
|
0 |
0 |
<< ", l2=" << l2 << ", early_stopping=" << (early_stopping ? 1 : 0) << endl; |
21648
|
0 |
0 |
binary_encoder enc; |
21649
|
0 |
0 |
enc.add_str("nn_versioned"); |
21651
|
0 |
0 |
parameters, 1, train_trees, heldout_trees, enc); |
21652
|
0 |
0 |
compressor::save(os, enc); |
21664
|
0 |
0 |
if (!is.get(len)) return false; |
|
0 |
0 |
if (!is.get(len)) return false; |
21666
|
0 |
0 |
if (!is.read(&name[0], len)) return false; |
|
0 |
0 |
if (!is.read(&name[0], len)) return false; |
21667
|
0 |
0 |
if (name != "morphodita_parsito") return false; |
21670
|
0 |
0 |
if (!is.get(version)) return false; |
|
0 |
0 |
if (!is.get(version)) return false; |
21671
|
0 |
0 |
if (!(version >= 1 && version <= model_morphodita_parsito::VERSION_LATEST)) return false; |
21676
|
0 |
0 |
if (version >= 2) { |
21678
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
21679
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
|
0 |
0 |
if (!is.get(sentinel) || sentinel != 0x7F) return false; |
21684
|
0 |
0 |
if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg(); |
|
0 |
0 |
if (model == TOKENIZER_MODEL) range.str = data.data() + is.tellg(); |
21685
|
0 |
0 |
char tokenizer; if (!is.get(tokenizer)) return false; |
|
0 |
0 |
char tokenizer; if (!is.get(tokenizer)) return false; |
21686
|
0 |
0 |
unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
|
0 |
0 |
unique_ptr tokenizer_factory(tokenizer ? morphodita::tokenizer_factory::load(is) : nullptr); |
21687
|
0 |
0 |
if (tokenizer && !tokenizer_factory) return false; |
|
0 |
0 |
if (tokenizer && !tokenizer_factory) return false; |
|
0 |
0 |
if (tokenizer && !tokenizer_factory) return false; |
21688
|
0 |
0 |
unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr); |
|
0 |
0 |
unique_ptr splitter(tokenizer ? multiword_splitter::load(is) : nullptr); |
21689
|
0 |
0 |
if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
0 |
0 |
if (model == TOKENIZER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
21694
|
0 |
0 |
if (model == TAGGER_MODEL) range.str = data.data() + is.tellg(); |
|
0 |
0 |
if (model == TAGGER_MODEL) range.str = data.data() + is.tellg(); |
21695
|
0 |
0 |
char taggers; if (!is.get(taggers)) return false; |
|
0 |
0 |
char taggers; if (!is.get(taggers)) return false; |
21696
|
0 |
0 |
for (char i = 0; i < taggers; i++) { |
21697
|
0 |
0 |
char lemma; if (!is.get(lemma)) return false; |
|
0 |
0 |
char lemma; if (!is.get(lemma)) return false; |
21698
|
0 |
0 |
char xpostag; if (!is.get(xpostag)) return false; |
|
0 |
0 |
char xpostag; if (!is.get(xpostag)) return false; |
21699
|
0 |
0 |
char feats; if (!is.get(feats)) return false; |
|
0 |
0 |
char feats; if (!is.get(feats)) return false; |
21700
|
0 |
0 |
unique_ptr tagger(morphodita::tagger::load(is)); |
21701
|
0 |
0 |
if (!tagger) return false; |
21703
|
0 |
0 |
if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
0 |
0 |
if (model == TAGGER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
21708
|
0 |
0 |
if (model == PARSER_MODEL) range.str = data.data() + is.tellg(); |
|
0 |
0 |
if (model == PARSER_MODEL) range.str = data.data() + is.tellg(); |
21710
|
0 |
0 |
if (!is.get(parser)) return false; |
|
0 |
0 |
if (!is.get(parser)) return false; |
21711
|
0 |
0 |
unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr); |
|
0 |
0 |
unique_ptr parser_model(parser ? parsito::parser::load(is) : nullptr); |
21712
|
0 |
0 |
if (parser && !parser_model) return false; |
|
0 |
0 |
if (parser && !parser_model) return false; |
|
0 |
0 |
if (parser && !parser_model) return false; |
21713
|
0 |
0 |
if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
|
0 |
0 |
if (model == PARSER_MODEL) return range.len = is.tellg() - streampos(range.str - data.data()), true; |
21720
|
0 |
0 |
return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_form(form, output); |
21724
|
0 |
0 |
return model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).normalize_lemma(lemma, output); |
21728
|
0 |
0 |
model_morphodita_parsito(model_morphodita_parsito::VERSION_LATEST).fill_word_analysis(analysis, false, upostag, lemma, xpostag, feats, word); |
21736
|
0 |
0 |
unique_ptr conllu_input_format(input_format::new_conllu_input_format()); |
21738
|
0 |
0 |
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
|
0 |
0 |
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
|
0 |
0 |
int run = 0; if (!option_int(tagger, "run", run, error, model)) return false; |
21741
|
0 |
0 |
for (auto&& sentence : training) |
21742
|
0 |
0 |
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
|
0 |
0 |
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
|
0 |
0 |
for (size_t i = 1; !have_lemma && i < sentence.words.size(); i++) |
21743
|
0 |
0 |
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
|
0 |
0 |
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
|
0 |
0 |
if (!sentence.words[i].lemma.empty() && sentence.words[i].lemma != "_") |
21745
|
0 |
0 |
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
|
0 |
0 |
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
|
0 |
0 |
bool use_lemma_flag = model == 1 || models == 1; if (!option_bool(tagger, "use_lemma", use_lemma_flag, error, model)) return false; |
21746
|
0 |
0 |
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
|
0 |
0 |
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
|
0 |
0 |
int lemma_encoding = 2; if (!option_int(tagger, "dictionary_lemma_encoding", lemma_encoding, error, model)) return false; |
21747
|
0 |
0 |
int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0; |
|
0 |
0 |
int use_lemma = have_lemma && use_lemma_flag ? lemma_encoding : 0; |
21748
|
0 |
0 |
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
|
0 |
0 |
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
|
0 |
0 |
bool use_xpostag = model == 0; if (!option_bool(tagger, "use_xpostag", use_xpostag, error, model)) return false; |
21749
|
0 |
0 |
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
|
0 |
0 |
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
|
0 |
0 |
bool use_feats = model == 0; if (!option_bool(tagger, "use_feats", use_feats, error, model)) return false; |
21751
|
0 |
0 |
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
|
0 |
0 |
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
|
0 |
0 |
bool provide_lemma = model == 1 || models == 1; if (!option_bool(tagger, "provide_lemma", provide_lemma, error, model)) return false; |
21752
|
0 |
0 |
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
|
0 |
0 |
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
|
0 |
0 |
bool provide_xpostag = model == 0; if (!option_bool(tagger, "provide_xpostag", provide_xpostag, error, model)) return false; |
21753
|
0 |
0 |
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
|
0 |
0 |
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
|
0 |
0 |
bool provide_feats = model == 0; if (!option_bool(tagger, "provide_feats", provide_feats, error, model)) return false; |
21754
|
0 |
0 |
os.put(char(provide_lemma ? use_lemma : 0)); |
|
0 |
0 |
os.put(char(provide_lemma ? use_lemma : 0)); |
21755
|
0 |
0 |
os.put(char(provide_xpostag && use_xpostag)); |
|
0 |
0 |
os.put(char(provide_xpostag && use_xpostag)); |
|
0 |
0 |
os.put(char(provide_xpostag && use_xpostag)); |
21756
|
0 |
0 |
os.put(char(provide_feats && use_feats)); |
|
0 |
0 |
os.put(char(provide_feats && use_feats)); |
|
0 |
0 |
os.put(char(provide_feats && use_feats)); |
21758
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
|
0 |
0 |
cerr << "Tagger model " << model+1 << " columns: " << "lemma use=" << (use_lemma ? 1 : 0) << "/provide=" << (provide_lemma ? 1 : 0) |
21759
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
|
0 |
0 |
<< ", xpostag use=" << (use_xpostag ? 1 : 0) << "/provide=" << (provide_xpostag ? 1 : 0) |
21760
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
|
0 |
0 |
<< ", feats use=" << (use_feats ? 1 : 0) << "/provide=" << (provide_feats ? 1 : 0) << endl; |
21763
|
0 |
0 |
stringstream morpho_description; |
21767
|
0 |
0 |
const string& dictionary_model = option_str(tagger, "dictionary_model", model); |
|
0 |
0 |
const string& dictionary_model = option_str(tagger, "dictionary_model", model); |
21768
|
0 |
0 |
if (!dictionary_model.empty()) { |
21777
|
0 |
0 |
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
|
0 |
0 |
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
|
0 |
0 |
int dictionary_suffix_len = 8; if (!option_int(tagger, "dictionary_suffix_len", dictionary_suffix_len, error, model)) return false; |
21779
|
0 |
0 |
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
|
0 |
0 |
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
|
0 |
0 |
if (!option_str(tagger, "dictionary_flat_lemmas", model).empty()) { |
21781
|
0 |
0 |
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
|
0 |
0 |
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
|
0 |
0 |
split(option_str(tagger, "dictionary_flat_lemmas", model), ',', lemmas); |
21782
|
0 |
0 |
for (auto&& lemma : lemmas) { |
21783
|
0 |
0 |
if (lemma.find('~') != string::npos) |
21784
|
0 |
0 |
return error.assign("Dictionary_flat_lemmas cannot contain '~' character!"), false; |
21788
|
0 |
0 |
flat_lemmas.insert("greek.expression"); |
21791
|
0 |
0 |
if (!option_str(tagger, "dictionary", model).empty()) |
|
0 |
0 |
if (!option_str(tagger, "dictionary", model).empty()) |
|
0 |
0 |
if (!option_str(tagger, "dictionary", model).empty()) |
21792
|
0 |
0 |
return error.assign("The tagger 'dictionary' option is no longer supported, use 'dictionary_file' instead!"), false; |
21793
|
0 |
0 |
const string& dictionary_file = option_str(tagger, "dictionary_file", model); |
|
0 |
0 |
const string& dictionary_file = option_str(tagger, "dictionary_file", model); |
21794
|
0 |
0 |
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
|
0 |
0 |
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
|
0 |
0 |
int max_form_analyses = 0; if (!option_int(tagger, "dictionary_max_form_analyses", max_form_analyses, error, model)) return false; |
21796
|
0 |
0 |
cerr << "Tagger model " << model+1 << " dictionary options: " << "max_form_analyses=" << max_form_analyses |
21797
|
0 |
0 |
<< ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl; |
|
0 |
0 |
<< ", custom dictionary_file=" << (dictionary_file.empty() ? "none" : dictionary_file) << endl; |
21800
|
0 |
0 |
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
|
0 |
0 |
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
|
0 |
0 |
int guesser_suffix_len = 4; if (!option_int(tagger, "guesser_suffix_len", guesser_suffix_len, error, model)) return false; |
21801
|
0 |
0 |
int guesser_suffix_rules = run <= 1 ? 8 : 5 + hyperparameter_integer(run, 1, 0, 7); |
21802
|
0 |
0 |
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
|
0 |
0 |
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
|
0 |
0 |
if (!option_int(tagger, "guesser_suffix_rules", guesser_suffix_rules, error, model)) return false; |
21803
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
|
0 |
0 |
int guesser_prefixes_max = provide_lemma ? 4 : 0; if (!option_int(tagger, "guesser_prefixes_max", guesser_prefixes_max, error, model)) return false; |
21804
|
0 |
0 |
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
|
0 |
0 |
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
|
0 |
0 |
int guesser_prefix_min_count = 10; if (!option_int(tagger, "guesser_prefix_min_count", guesser_prefix_min_count, error, model)) return false; |
21805
|
0 |
0 |
int guesser_enrich_dictionary = run <= 1 ? 6 : 3 + hyperparameter_integer(run, 2, 0, 7); |
21806
|
0 |
0 |
if (!dictionary_file.empty()) guesser_enrich_dictionary = 0; |
21807
|
0 |
0 |
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
|
0 |
0 |
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
|
0 |
0 |
if (!option_int(tagger, "guesser_enrich_dictionary", guesser_enrich_dictionary, error, model)) return false; |
21809
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
|
0 |
0 |
if (run >= 1) cerr << "Random search run " << run << ", guesser_suffix_rules=" << guesser_suffix_rules |
21810
|
0 |
0 |
<< ", guesser_enrich_dictionary=" << guesser_enrich_dictionary << endl; |
21812
|
0 |
0 |
cerr << "Tagger model " << model+1 << " guesser options: " << "suffix_rules=" << guesser_suffix_rules |
21813
|
0 |
0 |
<< ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count |
|
0 |
0 |
<< ", prefixes_max=" << guesser_prefixes_max << ", prefix_min_count=" << guesser_prefix_min_count |
21814
|
0 |
0 |
<< ", enrich_dictionary=" << guesser_enrich_dictionary << endl; |
21817
|
0 |
0 |
stringstream guesser_description; |
21819
|
0 |
0 |
stringstream guesser_input; |
21820
|
0 |
0 |
for (auto&& sentence : training) { |
21821
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
21822
|
0 |
0 |
guesser_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
21823
|
0 |
0 |
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas) << '\t' |
21824
|
0 |
0 |
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
21827
|
0 |
0 |
morphodita::morpho_statistical_guesser_trainer::train(guesser_input, guesser_suffix_len, guesser_suffix_rules, guesser_prefixes_max, guesser_prefix_min_count, guesser_description); |
21835
|
0 |
0 |
for (auto&& sentence : training) |
21836
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) { |
21837
|
0 |
0 |
model_normalize_form(sentence.words[i].form, normalized_form); |
21838
|
0 |
0 |
entry.assign(combine_lemma(sentence.words[i], use_lemma, combined_lemma, flat_lemmas)) |
21839
|
0 |
0 |
.append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag)) |
|
0 |
0 |
.append("\t").append(combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag)) |
21840
|
0 |
0 |
.append("\t").append(normalized_form); |
21845
|
0 |
0 |
for (auto&& form_analyses : entries) { |
21847
|
0 |
0 |
for (auto&& analysis : form_analyses.second) |
21848
|
0 |
0 |
analyses.emplace_back(analysis.second, analysis.first); |
21849
|
0 |
0 |
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
|
0 |
0 |
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
|
0 |
0 |
if (max_form_analyses && int(analyses.size()) > max_form_analyses) { |
21851
|
0 |
0 |
analyses.resize(max_form_analyses); |
21853
|
0 |
0 |
for (auto&& analysis : analyses) |
21859
|
0 |
0 |
dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag); |
|
0 |
0 |
dictionary_special_tags.number_tag = most_frequent_tag(training, "NUM", use_xpostag, use_feats, combined_tag); |
21860
|
0 |
0 |
dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag); |
|
0 |
0 |
dictionary_special_tags.punctuation_tag = most_frequent_tag(training, "PUNCT", use_xpostag, use_feats, combined_tag); |
21861
|
0 |
0 |
dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag); |
|
0 |
0 |
dictionary_special_tags.symbol_tag = most_frequent_tag(training, "SYM", use_xpostag, use_feats, combined_tag); |
21864
|
0 |
0 |
if (!dictionary_file.empty()) { |
21865
|
0 |
0 |
ifstream is(path_from_utf8(dictionary_file).c_str()); |
21866
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
|
0 |
0 |
if (!is.is_open()) return error.assign("Cannot open dictionary_file '").append(dictionary_file).append("'!"), false; |
21869
|
0 |
0 |
word entry; |
21871
|
0 |
0 |
while (getline(is, line)) { |
|
0 |
0 |
while (getline(is, line)) { |
21873
|
0 |
0 |
if (line.empty()) continue; |
21875
|
0 |
0 |
split(line, '\t', dictionary_parts); |
21877
|
0 |
0 |
if (dictionary_parts.size() != 5) |
21878
|
0 |
0 |
return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false; |
|
0 |
0 |
return error.assign("Dictionary line '").append(line).append("' does not contain 5 tab-separated columns!"), false; |
21880
|
0 |
0 |
model_normalize_form(dictionary_parts[0], entry.form); |
21881
|
0 |
0 |
entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len); |
|
0 |
0 |
entry.lemma.assign(dictionary_parts[1].str, dictionary_parts[1].len == 1 && dictionary_parts[1].str[0] == '_' ? 0 : dictionary_parts[1].len); |
21882
|
0 |
0 |
entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len); |
|
0 |
0 |
entry.upostag.assign(dictionary_parts[2].str, dictionary_parts[2].len == 1 && dictionary_parts[2].str[0] == '_' ? 0 : dictionary_parts[2].len); |
21883
|
0 |
0 |
entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len); |
|
0 |
0 |
entry.xpostag.assign(dictionary_parts[3].str, dictionary_parts[3].len == 1 && dictionary_parts[3].str[0] == '_' ? 0 : dictionary_parts[3].len); |
21884
|
0 |
0 |
entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len); |
|
0 |
0 |
entry.feats.assign(dictionary_parts[4].str, dictionary_parts[4].len == 1 && dictionary_parts[4].str[0] == '_' ? 0 : dictionary_parts[4].len); |
21886
|
0 |
0 |
entry_encoded.assign(combine_lemma(entry, use_lemma, combined_lemma, flat_lemmas)) |
21887
|
0 |
0 |
.append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag)) |
|
0 |
0 |
.append("\t").append(combine_tag(entry, use_xpostag, use_feats, combined_tag)) |
21888
|
0 |
0 |
.append("\t").append(entry.form); |
21894
|
0 |
0 |
if (guesser_enrich_dictionary) { |
21896
|
0 |
0 |
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
|
0 |
0 |
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
|
0 |
0 |
stringstream empty_data, guesser_description_copy(guesser_description.str()), guesser_only_morphology; |
21897
|
0 |
0 |
guesser_only_morphology.put(morphodita::morpho_ids::GENERIC); |
21898
|
0 |
0 |
morphodita::generic_morpho_encoder::encode(empty_data, dictionary_suffix_len, dictionary_special_tags, guesser_description_copy, guesser_only_morphology); |
21900
|
0 |
0 |
unique_ptr guesser_only_morpho(morphodita::morpho::load(guesser_only_morphology)); |
21901
|
0 |
0 |
if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false; |
|
0 |
0 |
if (!guesser_only_morpho) return error.assign("Cannot create temporary guesser-only morphology!"), false; |
21906
|
0 |
0 |
for (auto&& sentence : training) |
21907
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) { |
21908
|
0 |
0 |
const auto& form = model_normalize_form(sentence.words[i].form, normalized_form); |
21909
|
0 |
0 |
if (!analyzed_forms.count(form)) { |
21910
|
0 |
0 |
guesser_only_morpho->analyze(form, morphodita::morpho::GUESSER, analyses); |
21913
|
0 |
0 |
for (auto&& analyse : analyses) { |
21914
|
0 |
0 |
entry.assign(analyse.lemma).push_back('\t'); |
21915
|
0 |
0 |
entry.append(analyse.tag).push_back('\t'); |
21917
|
0 |
0 |
if (dictionary_entries.insert(entry).second) |
21918
|
0 |
0 |
if (!--to_add) |
21927
|
0 |
0 |
vector sorted_dictionary(dictionary_entries.begin(), dictionary_entries.end()); |
21930
|
0 |
0 |
stringstream morpho_input; |
21931
|
0 |
0 |
for (auto&& entry : sorted_dictionary) |
21934
|
0 |
0 |
morpho_description.put(morphodita::morpho_ids::GENERIC); |
21935
|
0 |
0 |
morphodita::generic_morpho_encoder::encode(morpho_input, dictionary_suffix_len, dictionary_special_tags, guesser_description, morpho_description); |
21939
|
0 |
0 |
const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model); |
|
0 |
0 |
const string& dictionary_accuracy = option_str(tagger, "dictionary_accuracy", model); |
21940
|
0 |
0 |
if (!dictionary_accuracy.empty()) { |
21941
|
0 |
0 |
unique_ptr morpho(morphodita::morpho::load(morpho_description)); |
21942
|
0 |
0 |
if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false; |
|
0 |
0 |
if (!morpho) return error.assign("Cannot create temporary morphology for evaluating accuracy!"), false; |
21943
|
0 |
0 |
morpho_description.seekg(0, ios::beg); |
21948
|
0 |
0 |
word w; |
21950
|
0 |
0 |
conllu_input_format->set_text(dictionary_accuracy.c_str()); |
21951
|
0 |
0 |
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
|
0 |
0 |
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
|
0 |
0 |
for (sentence sentence; conllu_input_format->next_sentence(sentence, error); ) |
21952
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) { |
21953
|
0 |
0 |
morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses); |
|
0 |
0 |
morpho->analyze(model_normalize_form(sentence.words[i].form, normalized_form), morphodita::morpho::GUESSER, analyses); |
21955
|
0 |
0 |
for (auto&& analysis : analyses) { |
21956
|
0 |
0 |
w.lemma.assign("_"); |
21957
|
0 |
0 |
model_fill_word_analysis(analysis, true, use_lemma, true, true, w); |
21961
|
0 |
0 |
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
|
0 |
0 |
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
|
0 |
0 |
all_tags_ok |= int(sentence.words[i].upostag == w.upostag && sentence.words[i].xpostag == w.xpostag && sentence.words[i].feats == w.feats); |
21972
|
0 |
0 |
if (!error.empty()) return false; |
21981
|
0 |
0 |
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
|
0 |
0 |
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
|
0 |
0 |
double tagger_order = 3; if (!option_double(tagger, "order", tagger_order, error, model)) return false; |
21983
|
0 |
0 |
if (tagger_order == 2) tagger_id = morphodita::tagger_ids::CONLLU2; |
21984
|
0 |
0 |
else if (tagger_order == 2.5) tagger_id = morphodita::tagger_ids::CONLLU2_3; |
21985
|
0 |
0 |
else if (tagger_order == 3) tagger_id = morphodita::tagger_ids::CONLLU3; |
21986
|
0 |
0 |
else return error.assign("The tagger_order can be only 2, 2.5 or 3!"), false; |
21988
|
0 |
0 |
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
|
0 |
0 |
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
|
0 |
0 |
int tagger_iterations = 20; if (!option_int(tagger, "iterations", tagger_iterations, error, model)) return false; |
21989
|
0 |
0 |
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
|
0 |
0 |
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
|
0 |
0 |
bool tagger_prune_features = false; if (!option_bool(tagger, "prune_features", tagger_prune_features, error, model)) return false; |
21990
|
0 |
0 |
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
|
0 |
0 |
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
|
0 |
0 |
bool tagger_early_stopping = true; if (!option_bool(tagger, "early_stopping", tagger_early_stopping, error, model)) return false; |
21992
|
0 |
0 |
option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger : |
|
0 |
0 |
option_str(tagger, "templates", model) == "tagger" ? tagger_features_tagger : |
21993
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
|
0 |
0 |
option_str(tagger, "templates", model) == "lemmatizer" ? tagger_features_lemmatizer : |
21994
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
|
0 |
0 |
!option_str(tagger, "templates", model).empty() ? option_str(tagger, "templates", model) : |
21995
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
|
0 |
0 |
model == 1 ? tagger_features_lemmatizer : tagger_features_tagger; |
21996
|
0 |
0 |
if (heldout.empty()) tagger_early_stopping = false; |
21998
|
0 |
0 |
cerr << "Tagger model " << model+1 << " options: iterations=" << tagger_iterations |
21999
|
0 |
0 |
<< ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates=" |
|
0 |
0 |
<< ", early_stopping=" << (tagger_early_stopping ? 1 : 0) << ", templates=" |
22001
|
0 |
0 |
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
|
0 |
0 |
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
|
0 |
0 |
tagger_feature_templates == tagger_features_lemmatizer ? "lemmatizer" : "custom") << endl; |
22005
|
0 |
0 |
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
|
0 |
0 |
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
|
0 |
0 |
stringstream input, heldout_input, feature_templates_input(tagger_feature_templates); |
22006
|
0 |
0 |
for (auto&& sentence : training) { |
22007
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
22008
|
0 |
0 |
input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
22009
|
0 |
0 |
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
22010
|
0 |
0 |
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
22014
|
0 |
0 |
for (auto&& sentence : heldout) { |
22015
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
22016
|
0 |
0 |
heldout_input << model_normalize_form(sentence.words[i].form, normalized_form) << '\t' |
22017
|
0 |
0 |
<< combine_lemma(sentence.words[i], use_lemma, combined_lemma) << '\t' |
22018
|
0 |
0 |
<< combine_tag(sentence.words[i], use_xpostag, use_feats, combined_tag) << '\n'; |
22022
|
0 |
0 |
os.put(tagger_id); |
22023
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
|
0 |
0 |
morphodita::tagger_trainer>>::train(morphodita::tagger_ids::decoding_order(tagger_id), morphodita::tagger_ids::window_size(tagger_id), tagger_iterations, morpho_description, true, feature_templates_input, tagger_prune_features, input, heldout_input, tagger_early_stopping, os); |
22032
|
0 |
0 |
while (separator < tag_separators.size() && |
22033
|
0 |
0 |
(w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos)) |
22036
|
0 |
0 |
if (separator >= tag_separators.size()) { |
22045
|
0 |
0 |
while (separator < tag_separators.size() && |
22046
|
0 |
0 |
(w.upostag.find(tag_separators[separator]) != string::npos || w.xpostag.find(tag_separators[separator]) != string::npos)) |
22048
|
0 |
0 |
if (separator >= tag_separators.size()) |
22054
|
0 |
0 |
if (xpostag || feats) { |
22056
|
0 |
0 |
if (xpostag) combined_tag.append(w.xpostag); |
22057
|
0 |
0 |
if (feats) combined_tag.push_back(tag_separators[separator]); |
22058
|
0 |
0 |
if (feats) combined_tag.append(w.feats); |
22067
|
0 |
0 |
for (auto&& sentence : data) |
22068
|
0 |
0 |
for (size_t i = 1; i < sentence.words.size(); i++) |
22069
|
0 |
0 |
if (sentence.words[i].upostag == upostag) |
22070
|
0 |
0 |
counts[combine_tag(sentence.words[i], xpostag, feats, combined_tag)]++; |
22072
|
0 |
0 |
combined_tag.assign("~").append(upostag); |
22074
|
0 |
0 |
for (auto&& tags : counts) |
22075
|
0 |
0 |
if (tags.second > best) { |
22088
|
0 |
0 |
if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma)) |
22092
|
0 |
0 |
if (w.lemma == "") |
22094
|
0 |
0 |
else if (w.lemma == "_") |
22098
|
0 |
0 |
if (flat_lemmas.count(w.lemma) || flat_lemmas.count(combined_lemma)) { |
22100
|
0 |
0 |
model_normalize_form(w.form, normalized_form); |
22101
|
0 |
0 |
return combined_lemma.insert(0, "~").append("~").append(normalized_form); |
|
0 |
0 |
return combined_lemma.insert(0, "~").append("~").append(normalized_form); |
22111
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
22118
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
22120
|
0 |
0 |
if (options.count(indexed_name)) |
22121
|
0 |
0 |
return parse_int(options.at(indexed_name), name.c_str(), value, error); |
22122
|
0 |
0 |
if (options.count(name)) |
22123
|
0 |
0 |
return parse_int(options.at(name), name.c_str(), value, error); |
22129
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
22131
|
0 |
0 |
if (options.count(indexed_name) || options.count(name)) { |
22133
|
0 |
0 |
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
|
0 |
0 |
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
|
0 |
0 |
if (!parse_int(options.count(indexed_name) ? options.at(indexed_name) : options.at(name), name.c_str(), int_value, error)) |
22142
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
|
0 |
0 |
if (model >= 0 && model < 9) indexed_name.append("_").push_back('1' + model); |
22144
|
0 |
0 |
if (options.count(indexed_name)) |
22145
|
0 |
0 |
return parse_double(options.at(indexed_name), name.c_str(), value, error); |
22146
|
0 |
0 |
if (options.count(name)) |
22147
|
0 |
0 |
return parse_double(options.at(name), name.c_str(), value, error); |
22307
|
0 |
0 |
training_error::training_error() : runtime_error(message_collector.str()) { |
22580
|
0 |
0 |
decompose(str, true); |
|
0 |
0 |
decompose(str, true); |
|
0 |
0 |
decompose(str, true); |
22585
|
0 |
0 |
for (old = 0, com = 0; old < str.size(); old++, com++) { |
22587
|
0 |
0 |
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
|
0 |
0 |
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
|
0 |
0 |
if (str[old] >= Hangul::LBase && str[old] < Hangul::LBase + Hangul::LCount) { |
22589
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] >= Hangul::VBase && str[old + 1] < Hangul::VBase + Hangul::VCount) { |
22592
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
0 |
0 |
if (old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
22595
|
0 |
0 |
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
0 |
0 |
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
0 |
0 |
} else if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
22597
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
|
0 |
0 |
if ((str[old] - Hangul::SBase) % Hangul::TCount && old + 1 < str.size() && str[old + 1] > Hangul::TBase && str[old + 1] < Hangul::TBase + Hangul::TCount) |
22599
|
0 |
0 |
} else if (str[old] < CHARS) { |
22603
|
0 |
0 |
for (int last_ccc = -1; old + 1 < str.size(); old++) { |
22604
|
0 |
0 |
int ccc = str[old + 1] < CHARS ? ccc_block[ccc_index[str[old + 1] >> 8]][str[old + 1] & 0xFF] : 0; |
22605
|
0 |
0 |
if (composition[1] - composition[0] && last_ccc < ccc) { |
|
0 |
0 |
if (composition[1] - composition[0] && last_ccc < ccc) { |
22608
|
0 |
0 |
while (l + 2 < r) { |
22610
|
0 |
0 |
if (composition_data[m] <= str[old + 1]) l = m; |
22611
|
0 |
0 |
if (composition_data[m] >= str[old + 1]) r = m; |
22613
|
0 |
0 |
if (composition_data[l] == str[old + 1]) { |
22621
|
0 |
0 |
if (!ccc) break; |
22628
|
0 |
0 |
if (com < old) str.resize(com); |
22635
|
0 |
0 |
for (auto&& chr : str) { |
22638
|
0 |
0 |
if (chr >= Hangul::SBase && chr < Hangul::SBase + Hangul::SCount) { |
22640
|
0 |
0 |
decomposition_len = 2 + ((chr - Hangul::SBase) % Hangul::TCount ? 1 : 0); |
22641
|
0 |
0 |
} else if (chr < CHARS) { |
22645
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
22646
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) |
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) |
22648
|
0 |
0 |
for (auto i = decomposition[0] >> 2; i < decomposition[1] >> 2; i++) { |
22650
|
0 |
0 |
if (further_decomposition[0] & 1) decomposition_len += (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2) - 1; |
22654
|
0 |
0 |
if (!decomposition_len) continue; |
22660
|
0 |
0 |
if (any_decomposition) { |
22662
|
0 |
0 |
for (size_t dec = str.size(), old = dec - additional; old--; ) |
22663
|
0 |
0 |
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
0 |
0 |
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
|
0 |
0 |
if (str[old] >= Hangul::SBase && str[old] < Hangul::SBase + Hangul::SCount) { |
22666
|
0 |
0 |
if (s_index % Hangul::TCount) str[--dec] = Hangul::TBase + s_index % Hangul::TCount; |
22669
|
0 |
0 |
} else if (str[old] < CHARS) { |
22673
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
|
0 |
0 |
if (decomposition_len && !kompatibility && (decomposition[0] & 1)) decomposition_len = 0; |
22674
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) { |
|
0 |
0 |
if (decomposition_len && kompatibility && (decomposition[0] & 2)) { |
22676
|
0 |
0 |
while (decomposition_len--) { |
22679
|
0 |
0 |
if (further_decomposition[0] & 1) { |
22680
|
0 |
0 |
for (int further_decomposition_len = (further_decomposition[1] >> 2) - (further_decomposition[0] >> 2); further_decomposition_len--; ) |
22686
|
0 |
0 |
} else if (decomposition_len) { |
22688
|
0 |
0 |
while (decomposition_len--) |
22701
|
0 |
0 |
for (size_t i = 1; i < str.size(); i++) { |
22702
|
0 |
0 |
unsigned ccc = str[i] < CHARS ? ccc_block[ccc_index[str[i] >> 8]][str[i] & 0xFF] : 0; |
22703
|
0 |
0 |
if (!ccc) continue; |
22707
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
|
0 |
0 |
for (j = i; j && (str[j-1] < CHARS ? ccc_block[ccc_index[str[j-1] >> 8]][str[j-1] & 0xFF] : 0) > ccc; j--) str[j] = str[j-1]; |
22963
|
0 |
0 |
for (; *str; str++) |
22964
|
0 |
0 |
if (((unsigned char)*str) >= 0x80) { |
22965
|
0 |
0 |
if (((unsigned char)*str) < 0xC0) return false; |
22966
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
22967
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22968
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
22969
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22970
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22971
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
22972
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22973
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22974
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22981
|
0 |
0 |
for (; len > 0; str++, len--) |
22982
|
0 |
0 |
if (((unsigned char)*str) >= 0x80) { |
22983
|
0 |
0 |
if (((unsigned char)*str) < 0xC0) return false; |
22984
|
0 |
0 |
else if (((unsigned char)*str) < 0xE0) { |
22985
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22986
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF0) { |
22987
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22988
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22989
|
0 |
0 |
} else if (((unsigned char)*str) < 0xF8) { |
22990
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22991
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
22992
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
|
0 |
0 |
str++; if (!--len || ((unsigned char)*str) < 0x80 || ((unsigned char)*str) >= 0xC0) return false; |
23001
|
0 |
0 |
for (char32_t chr; (chr = decode(str)); ) |
23008
|
0 |
0 |
while (len) |
23015
|
0 |
0 |
for (auto&& chr : str) |
23043
|
0 |
0 |
return {3, 3, 0, ""}; |
|
0 |
0 |
return {3, 3, 0, ""}; |
|
0 |
0 |
return {3, 3, 0, ""}; |
23573
|
3092 |
104350 |
IF_BIT_0(prob) |
|
23097 |
84345 |
IF_BIT_0(prob) |
23578
|
23091 |
6 |
if (checkDicSize != 0 || processedPos != 0) |
23580
|
0 |
23091 |
(dic[(dicPos == 0 ? dicBufSize : dicPos) - 1] >> (8 - lc)))); |
23582
|
21934 |
1163 |
if (state < kNumLitStates) |
23586
|
18121 |
157351 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
81155 |
94317 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
|
153538 |
21934 |
do { GET_BIT(prob + symbol, symbol) } while (symbol < 0x100); |
23590
|
0 |
1163 |
unsigned matchByte = p->dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
23592
|
166 |
997 |
state -= (state < 10) ? 3 : 6; |
23601
|
1028 |
8276 |
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
|
5355 |
3949 |
GET_BIT2(probLit, symbol, offs &= ~bit, offs &= bit) |
23603
|
8141 |
1163 |
while (symbol < 0x100); |
23613
|
572 |
83773 |
IF_BIT_0(prob) |
|
487 |
83858 |
IF_BIT_0(prob) |
23622
|
83858 |
0 |
if (checkDicSize == 0 && processedPos == 0) |
23625
|
280 |
83578 |
IF_BIT_0(prob) |
|
83695 |
163 |
IF_BIT_0(prob) |
23629
|
283 |
83412 |
IF_BIT_0(prob) |
|
645 |
83050 |
IF_BIT_0(prob) |
23632
|
0 |
645 |
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
23635
|
3 |
642 |
state = state < kNumLitStates ? 9 : 11; |
23645
|
49 |
114 |
IF_BIT_0(prob) |
|
97 |
66 |
IF_BIT_0(prob) |
23654
|
9 |
57 |
IF_BIT_0(prob) |
|
37 |
29 |
IF_BIT_0(prob) |
23670
|
82945 |
268 |
state = state < kNumLitStates ? 8 : 11; |
23676
|
462 |
83238 |
IF_BIT_0(probLen) |
|
445 |
83255 |
IF_BIT_0(probLen) |
23687
|
274 |
82981 |
IF_BIT_0(probLen) |
|
113 |
83142 |
IF_BIT_0(probLen) |
23702
|
2505 |
664305 |
TREE_DECODE(probLen, limit, len); |
|
2240 |
664570 |
TREE_DECODE(probLen, limit, len); |
|
583110 |
83700 |
TREE_DECODE(probLen, limit, len); |
23706
|
487 |
83213 |
if (state >= kNumStates) |
23711
|
55 |
432 |
TREE_6_DECODE(prob, distance); |
|
325 |
162 |
TREE_6_DECODE(prob, distance); |
|
55 |
432 |
TREE_6_DECODE(prob, distance); |
|
429 |
58 |
TREE_6_DECODE(prob, distance); |
|
48 |
439 |
TREE_6_DECODE(prob, distance); |
|
303 |
184 |
TREE_6_DECODE(prob, distance); |
|
65 |
422 |
TREE_6_DECODE(prob, distance); |
|
254 |
233 |
TREE_6_DECODE(prob, distance); |
|
58 |
429 |
TREE_6_DECODE(prob, distance); |
|
260 |
227 |
TREE_6_DECODE(prob, distance); |
|
56 |
431 |
TREE_6_DECODE(prob, distance); |
|
272 |
215 |
TREE_6_DECODE(prob, distance); |
23712
|
405 |
82 |
if (distance >= kStartPosModelIndex) |
23717
|
166 |
239 |
if (posSlot < kEndPosModelIndex) |
23724
|
325 |
166 |
do |
23726
|
54 |
437 |
GET_BIT2(prob + i, i, ; , distance |= mask); |
|
239 |
252 |
GET_BIT2(prob + i, i, ; , distance |= mask); |
23735
|
2333 |
239 |
do |
23737
|
309 |
2263 |
NORMALIZE |
23761
|
29 |
210 |
GET_BIT2(prob + i, i, ; , distance |= 1); |
|
124 |
115 |
GET_BIT2(prob + i, i, ; , distance |= 1); |
23762
|
37 |
202 |
GET_BIT2(prob + i, i, ; , distance |= 2); |
|
118 |
121 |
GET_BIT2(prob + i, i, ; , distance |= 2); |
23763
|
32 |
207 |
GET_BIT2(prob + i, i, ; , distance |= 4); |
|
130 |
109 |
GET_BIT2(prob + i, i, ; , distance |= 4); |
23764
|
26 |
213 |
GET_BIT2(prob + i, i, ; , distance |= 8); |
|
126 |
113 |
GET_BIT2(prob + i, i, ; , distance |= 8); |
23766
|
0 |
239 |
if (distance == (uint32_t)0xFFFFFFFF) |
23778
|
487 |
0 |
if (checkDicSize == 0) |
23780
|
487 |
0 |
if (distance >= processedPos) |
23783
|
0 |
0 |
else if (distance >= checkDicSize) |
23785
|
229 |
258 |
state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3; |
23790
|
83700 |
0 |
if (limit == dicPos) |
23794
|
0 |
83700 |
unsigned curLen = ((rem < len) ? (unsigned)rem : len); |
23795
|
0 |
83700 |
size_t pos = (dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0); |
23800
|
83700 |
0 |
if (pos + curLen <= dicBufSize) |
23806
|
22566528 |
83700 |
do |
23812
|
0 |
0 |
do |
23815
|
0 |
0 |
if (++pos == dicBufSize) |
23823
|
106938 |
504 |
while (dicPos < limit && buf < bufLimit); |
23824
|
9 |
495 |
NORMALIZE; |
23842
|
0 |
510 |
if (p->remainLen != 0 && p->remainLen < kMatchSpecLenStart) |
23849
|
0 |
0 |
if (limit - dicPos < len) |
23852
|
0 |
0 |
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
|
0 |
0 |
if (p->checkDicSize == 0 && p->prop.dicSize - p->processedPos <= len) |
23857
|
0 |
0 |
while (len-- != 0) |
23859
|
0 |
0 |
dic[dicPos] = dic[(dicPos - rep0) + ((dicPos < rep0) ? dicBufSize : 0)]; |
23871
|
504 |
0 |
if (p->checkDicSize == 0) |
23874
|
0 |
504 |
if (limit - p->dicPos > rem) |
23877
|
504 |
0 |
RINOK(LzmaDec_DecodeReal(p, limit2, bufLimit)); |
23878
|
0 |
504 |
if (p->processedPos >= p->prop.dicSize) |
23882
|
498 |
6 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
0 |
498 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
|
0 |
0 |
while (p->dicPos < limit && p->buf < bufLimit && p->remainLen < kMatchSpecLenStart); |
23884
|
0 |
504 |
if (p->remainLen > kMatchSpecLenStart) |
23915
|
0 |
470 |
IF_BIT_0_CHECK(prob) |
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
37 |
433 |
IF_BIT_0_CHECK(prob) |
23922
|
36 |
1 |
if (p->checkDicSize != 0 || p->processedPos != 0) |
23925
|
0 |
36 |
(p->dic[(p->dicPos == 0 ? p->dicBufSize : p->dicPos) - 1] >> (8 - p->prop.lc)))); |
23927
|
27 |
10 |
if (state < kNumLitStates) |
23930
|
23 |
193 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
23 |
0 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
111 |
105 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
|
189 |
27 |
do { GET_BIT_CHECK(prob + symbol, symbol) } while (symbol < 0x100); |
23935
|
0 |
10 |
((p->dicPos < p->reps[0]) ? p->dicBufSize : 0)]; |
23945
|
10 |
70 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
10 |
0 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
|
56 |
24 |
GET_BIT2_CHECK(probLit, symbol, offs &= ~bit, offs &= bit) |
23947
|
70 |
10 |
while (symbol < 0x100); |
23957
|
2 |
431 |
IF_BIT_0_CHECK(prob) |
|
2 |
0 |
IF_BIT_0_CHECK(prob) |
|
17 |
416 |
IF_BIT_0_CHECK(prob) |
23969
|
0 |
416 |
IF_BIT_0_CHECK(prob) |
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
415 |
1 |
IF_BIT_0_CHECK(prob) |
23973
|
1 |
414 |
IF_BIT_0_CHECK(prob) |
|
1 |
0 |
IF_BIT_0_CHECK(prob) |
|
0 |
415 |
IF_BIT_0_CHECK(prob) |
23976
|
0 |
0 |
NORMALIZE_CHECK; |
|
0 |
0 |
NORMALIZE_CHECK; |
23988
|
0 |
1 |
IF_BIT_0_CHECK(prob) |
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
1 |
0 |
IF_BIT_0_CHECK(prob) |
23996
|
0 |
1 |
IF_BIT_0_CHECK(prob) |
|
0 |
0 |
IF_BIT_0_CHECK(prob) |
|
0 |
1 |
IF_BIT_0_CHECK(prob) |
24012
|
4 |
429 |
IF_BIT_0_CHECK(probLen) |
|
4 |
0 |
IF_BIT_0_CHECK(probLen) |
|
16 |
417 |
IF_BIT_0_CHECK(probLen) |
24023
|
2 |
415 |
IF_BIT_0_CHECK(probLen) |
|
2 |
0 |
IF_BIT_0_CHECK(probLen) |
|
2 |
415 |
IF_BIT_0_CHECK(probLen) |
24038
|
21 |
3353 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
21 |
0 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
56 |
3318 |
TREE_DECODE_CHECK(probLen, limit, len); |
|
2941 |
433 |
TREE_DECODE_CHECK(probLen, limit, len); |
24042
|
17 |
416 |
if (state < 4) |
24048
|
12 |
90 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
12 |
0 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
62 |
40 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
|
85 |
17 |
TREE_DECODE_CHECK(prob, 1 << kNumPosSlotBits, posSlot); |
24049
|
14 |
3 |
if (posSlot >= kStartPosModelIndex) |
24055
|
9 |
5 |
if (posSlot < kEndPosModelIndex) |
24062
|
37 |
5 |
do |
24064
|
5 |
37 |
NORMALIZE_CHECK |
|
5 |
0 |
NORMALIZE_CHECK |
24075
|
33 |
14 |
do |
24077
|
6 |
41 |
GET_BIT_CHECK(prob + i, i); |
|
6 |
0 |
GET_BIT_CHECK(prob + i, i); |
|
27 |
20 |
GET_BIT_CHECK(prob + i, i); |
24085
|
9 |
461 |
NORMALIZE_CHECK; |
|
9 |
0 |
NORMALIZE_CHECK; |
24102
|
0 |
0 |
if (initDic) |
24108
|
0 |
0 |
if (initState) |
24123
|
47940 |
6 |
for (i = 0; i < numProbs; i++) |
24139
|
510 |
0 |
while (p->remainLen != kMatchSpecLenStart) |
24143
|
6 |
504 |
if (p->needFlush != 0) |
24145
|
36 |
0 |
for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) |
|
30 |
6 |
for (; inSize > 0 && p->tempBufSize < RC_INIT_SIZE; (*srcLen)++, inSize--) |
24147
|
0 |
6 |
if (p->tempBufSize < RC_INIT_SIZE) |
24152
|
6 |
0 |
if (p->tempBuf[0] != 0) |
24160
|
6 |
504 |
if (p->dicPos >= dicLimit) |
24162
|
6 |
0 |
if (p->remainLen == 0 && p->code == 0) |
|
6 |
0 |
if (p->remainLen == 0 && p->code == 0) |
24167
|
0 |
0 |
if (finishMode == LZMA_FINISH_ANY) |
24172
|
0 |
0 |
if (p->remainLen != 0) |
24180
|
6 |
498 |
if (p->needInitState) |
24183
|
0 |
504 |
if (p->tempBufSize == 0) |
24187
|
470 |
34 |
if (inSize < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
24190
|
0 |
470 |
if (dummyRes == DUMMY_ERROR) |
24198
|
0 |
470 |
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
24208
|
504 |
0 |
if (LzmaDec_DecodeReal2(p, dicLimit, bufLimit) != 0) |
24218
|
0 |
0 |
while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) |
|
0 |
0 |
while (rem < LZMA_REQUIRED_INPUT_MAX && lookAhead < inSize) |
24221
|
0 |
0 |
if (rem < LZMA_REQUIRED_INPUT_MAX || checkEndMarkNow) |
24224
|
0 |
0 |
if (dummyRes == DUMMY_ERROR) |
24230
|
0 |
0 |
if (checkEndMarkNow && dummyRes != DUMMY_MATCH) |
24237
|
0 |
0 |
if (LzmaDec_DecodeReal2(p, dicLimit, p->buf) != 0) |
24246
|
0 |
0 |
if (p->code == 0) |
24261
|
0 |
0 |
if (p->dicPos == p->dicBufSize) |
24264
|
0 |
0 |
if (outSize > p->dicBufSize - dicPos) |
24284
|
0 |
0 |
if (res != 0) |
24286
|
0 |
0 |
if (outSizeCur == 0 || outSize == 0) |
24314
|
6 |
0 |
if (size < LZMA_PROPS_SIZE) |
24319
|
0 |
6 |
if (dicSize < LZMA_DIC_MIN) |
24324
|
6 |
0 |
if (d >= (9 * 5 * 5)) |
24338
|
0 |
6 |
if (p->probs == 0 || numProbs != p->numProbs) |
|
0 |
0 |
if (p->probs == 0 || numProbs != p->numProbs) |
24343
|
6 |
0 |
if (p->probs == 0) |
24352
|
6 |
0 |
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
24353
|
6 |
0 |
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
24362
|
0 |
0 |
RINOK(LzmaProps_Decode(&propNew, props, propsSize)); |
24363
|
0 |
0 |
RINOK(LzmaDec_AllocateProbs2(p, &propNew, alloc)); |
24365
|
0 |
0 |
if (p->dic == 0 || dicBufSize != p->dicBufSize) |
|
0 |
0 |
if (p->dic == 0 || dicBufSize != p->dicBufSize) |
24369
|
0 |
0 |
if (p->dic == 0) |
24389
|
6 |
0 |
if (inSize < RC_INIT_SIZE) |
24394
|
6 |
0 |
if (res != 0) |
24404
|
6 |
0 |
if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT) |
|
0 |
6 |
if (res == SZ_OK && *status == LZMA_STATUS_NEEDS_MORE_INPUT) |
24418
|
6 |
6 |
static void LzmaFree(void* /*p*/, void *address) { delete[] (char*) address; } |
24426
|
6 |
0 |
if (!is.read((char *) &uncompressed_len, sizeof(uncompressed_len))) return false; |
24427
|
6 |
0 |
if (!is.read((char *) &compressed_len, sizeof(compressed_len))) return false; |
24428
|
6 |
0 |
if (!is.read((char *) &poor_crc, sizeof(poor_crc))) return false; |
24429
|
6 |
0 |
if (poor_crc != uncompressed_len * 19991 + compressed_len * 199999991 + 1234567890) return false; |
24430
|
6 |
0 |
if (!is.read((char *) props_encoded, sizeof(props_encoded))) return false; |
24433
|
6 |
0 |
if (!is.read((char *) compressed.data(), compressed_len)) return false; |
|
6 |
0 |
if (!is.read((char *) compressed.data(), compressed_len)) return false; |
24437
|
6 |
0 |
auto res = lzma::LzmaDecode(data.fill(uncompressed_len), &uncompressed_size, compressed.data(), &compressed_size, props_encoded, LZMA_PROPS_SIZE, lzma::LZMA_FINISH_ANY, &status, &lzmaAllocator); |
24438
|
6 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
6 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
|
6 |
0 |
if (res != SZ_OK || uncompressed_size != uncompressed_len || compressed_size != compressed_len) return false; |
24768
|
0 |
0 |
if (!p->directInput) |
24780
|
0 |
0 |
if (p->directInput) |
24785
|
0 |
0 |
if (p->bufferBase == 0 || p->blockSize != blockSize) |
|
0 |
0 |
if (p->bufferBase == 0 || p->blockSize != blockSize) |
24808
|
0 |
0 |
if (p->streamEndWasReached || p->result != SZ_OK) |
|
0 |
0 |
if (p->streamEndWasReached || p->result != SZ_OK) |
24810
|
0 |
0 |
if (p->directInput) |
24813
|
0 |
0 |
if (curSize > p->directInputRem) |
24817
|
0 |
0 |
if (p->directInputRem == 0) |
24825
|
0 |
0 |
if (size == 0) |
24828
|
0 |
0 |
if (p->result != SZ_OK) |
24830
|
0 |
0 |
if (size == 0) |
24836
|
0 |
0 |
if (p->streamPos - p->pos > p->keepSizeAfter) |
24851
|
0 |
0 |
if (p->directInput) |
|
0 |
0 |
if (p->directInput) |
24859
|
0 |
0 |
if (p->streamEndWasReached) |
24861
|
0 |
0 |
if (p->keepSizeAfter >= p->streamPos - p->pos) |
24867
|
0 |
0 |
if (MatchFinder_NeedMove(p)) |
24890
|
0 |
0 |
for (i = 0; i < 256; i++) |
|
0 |
0 |
for (i = 0; i < 256; i++) |
24894
|
0 |
0 |
for (j = 0; j < 8; j++) |
|
0 |
0 |
for (j = 0; j < 8; j++) |
24915
|
0 |
0 |
if (sizeInBytes / sizeof(CLzRef) != num) |
24925
|
0 |
0 |
if (historySize > kMaxHistorySize) |
24931
|
0 |
0 |
if (historySize > ((uint32_t)2 << 30)) |
24938
|
0 |
0 |
if (LzInWindow_Create(p, sizeReserv, alloc)) |
24945
|
0 |
0 |
if (p->numHashBytes == 2) |
24956
|
0 |
0 |
if (hs > (1 << 24)) |
24958
|
0 |
0 |
if (p->numHashBytes == 3) |
24966
|
0 |
0 |
if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size; |
24967
|
0 |
0 |
if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size; |
24968
|
0 |
0 |
if (p->numHashBytes > 4) p->fixedHashSize += kHash4Size; |
24978
|
0 |
0 |
p->numSons = (p->btMode ? newCyclicBufferSize * 2 : newCyclicBufferSize); |
24980
|
0 |
0 |
if (p->hash != 0 && prevSize == newSize) |
|
0 |
0 |
if (p->hash != 0 && prevSize == newSize) |
24984
|
0 |
0 |
if (p->hash != 0) |
24999
|
0 |
0 |
if (limit2 < limit) |
25002
|
0 |
0 |
if (limit2 <= p->keepSizeAfter) |
25004
|
0 |
0 |
if (limit2 > 0) |
25009
|
0 |
0 |
if (limit2 < limit) |
25013
|
0 |
0 |
if (lenLimit > p->matchMaxLen) |
25023
|
0 |
0 |
for (i = 0; i < p->hashSizeSum; i++) |
25042
|
0 |
0 |
for (i = 0; i < numItems; i++) |
|
0 |
0 |
for (i = 0; i < numItems; i++) |
25045
|
0 |
0 |
if (value <= subValue) |
|
0 |
0 |
if (value <= subValue) |
25062
|
0 |
0 |
if (p->pos == kMaxValForNormalize) |
25064
|
0 |
0 |
if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) |
|
0 |
0 |
if (!p->streamEndWasReached && p->keepSizeAfter == p->streamPos - p->pos) |
25066
|
0 |
0 |
if (p->cyclicBufferPos == p->cyclicBufferSize) |
25079
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
25083
|
0 |
0 |
curMatch = son[_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)]; |
25084
|
0 |
0 |
if (pb[maxLen] == cur[maxLen] && *pb == *cur) |
|
0 |
0 |
if (pb[maxLen] == cur[maxLen] && *pb == *cur) |
25087
|
0 |
0 |
while (++len != lenLimit) |
25088
|
0 |
0 |
if (pb[len] != cur[len]) |
25090
|
0 |
0 |
if (maxLen < len) |
25094
|
0 |
0 |
if (len == lenLimit) |
25112
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
25118
|
0 |
0 |
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
25120
|
0 |
0 |
uint32_t len = (len0 < len1 ? len0 : len1); |
25121
|
0 |
0 |
if (pb[len] == cur[len]) |
25123
|
0 |
0 |
if (++len != lenLimit && pb[len] == cur[len]) |
|
0 |
0 |
if (++len != lenLimit && pb[len] == cur[len]) |
|
0 |
0 |
if (++len != lenLimit && pb[len] == cur[len]) |
25124
|
0 |
0 |
while (++len != lenLimit) |
25125
|
0 |
0 |
if (pb[len] != cur[len]) |
25127
|
0 |
0 |
if (maxLen < len) |
25131
|
0 |
0 |
if (len == lenLimit) |
25139
|
0 |
0 |
if (pb[len] < cur[len]) |
25166
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
|
0 |
0 |
if (cutValue-- == 0 || delta >= _cyclicBufferSize) |
25172
|
0 |
0 |
CLzRef *pair = son + ((_cyclicBufferPos - delta + ((delta > _cyclicBufferPos) ? _cyclicBufferSize : 0)) << 1); |
25174
|
0 |
0 |
uint32_t len = (len0 < len1 ? len0 : len1); |
25175
|
0 |
0 |
if (pb[len] == cur[len]) |
25177
|
0 |
0 |
while (++len != lenLimit) |
25178
|
0 |
0 |
if (pb[len] != cur[len]) |
25181
|
0 |
0 |
if (len == lenLimit) |
25189
|
0 |
0 |
if (pb[len] < cur[len]) |
25214
|
0 |
0 |
static void MatchFinder_MovePos(CMatchFinder *p) { MOVE_POS; } |
25236
|
0 |
0 |
GET_MATCHES_HEADER(2) |
25241
|
0 |
0 |
GET_MATCHES_FOOTER(offset, 1) |
25247
|
0 |
0 |
GET_MATCHES_HEADER(3) |
25252
|
0 |
0 |
GET_MATCHES_FOOTER(offset, 2) |
25258
|
0 |
0 |
GET_MATCHES_HEADER(3) |
25270
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
25272
|
0 |
0 |
for (; maxLen != lenLimit; maxLen++) |
25273
|
0 |
0 |
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
25278
|
0 |
0 |
if (maxLen == lenLimit) |
25281
|
0 |
0 |
MOVE_POS_RET; |
25284
|
0 |
0 |
GET_MATCHES_FOOTER(offset, maxLen) |
25290
|
0 |
0 |
GET_MATCHES_HEADER(4) |
25304
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
25310
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
25317
|
0 |
0 |
if (offset != 0) |
25319
|
0 |
0 |
for (; maxLen != lenLimit; maxLen++) |
25320
|
0 |
0 |
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
25323
|
0 |
0 |
if (maxLen == lenLimit) |
25326
|
0 |
0 |
MOVE_POS_RET; |
25329
|
0 |
0 |
if (maxLen < 3) |
25331
|
0 |
0 |
GET_MATCHES_FOOTER(offset, maxLen) |
25337
|
0 |
0 |
GET_MATCHES_HEADER(4) |
25351
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
|
0 |
0 |
if (delta2 < p->cyclicBufferSize && *(cur - delta2) == *cur) |
25357
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
|
0 |
0 |
if (delta2 != delta3 && delta3 < p->cyclicBufferSize && *(cur - delta3) == *cur) |
25364
|
0 |
0 |
if (offset != 0) |
25366
|
0 |
0 |
for (; maxLen != lenLimit; maxLen++) |
25367
|
0 |
0 |
if (cur[(ptrdiff_t)maxLen - delta2] != cur[maxLen]) |
25370
|
0 |
0 |
if (maxLen == lenLimit) |
25373
|
0 |
0 |
MOVE_POS_RET; |
25376
|
0 |
0 |
if (maxLen < 3) |
25380
|
0 |
0 |
MOVE_POS_RET |
25386
|
0 |
0 |
GET_MATCHES_HEADER(3) |
25392
|
0 |
0 |
MOVE_POS_RET |
25397
|
0 |
0 |
do |
25399
|
0 |
0 |
SKIP_HEADER(2) |
25403
|
0 |
0 |
SKIP_FOOTER |
25410
|
0 |
0 |
do |
25412
|
0 |
0 |
SKIP_HEADER(3) |
25416
|
0 |
0 |
SKIP_FOOTER |
25423
|
0 |
0 |
do |
25426
|
0 |
0 |
SKIP_HEADER(3) |
25431
|
0 |
0 |
SKIP_FOOTER |
25438
|
0 |
0 |
do |
25441
|
0 |
0 |
SKIP_HEADER(4) |
25447
|
0 |
0 |
SKIP_FOOTER |
25454
|
0 |
0 |
do |
25457
|
0 |
0 |
SKIP_HEADER(4) |
25464
|
0 |
0 |
MOVE_POS |
25471
|
0 |
0 |
do |
25473
|
0 |
0 |
SKIP_HEADER(3) |
25478
|
0 |
0 |
MOVE_POS |
25489
|
0 |
0 |
if (!p->btMode) |
|
0 |
0 |
if (!p->btMode) |
25494
|
0 |
0 |
else if (p->numHashBytes == 2) |
|
0 |
0 |
else if (p->numHashBytes == 2) |
25499
|
0 |
0 |
else if (p->numHashBytes == 3) |
|
0 |
0 |
else if (p->numHashBytes == 3) |
25611
|
0 |
0 |
if (level < 0) level = 5; |
25613
|
0 |
0 |
if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26))); |
|
0 |
0 |
if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26))); |
|
0 |
0 |
if (p->dictSize == 0) p->dictSize = (level <= 5 ? (1 << (level * 2 + 14)) : (level == 6 ? (1 << 25) : (1 << 26))); |
25614
|
0 |
0 |
if (p->lc < 0) p->lc = 3; |
25615
|
0 |
0 |
if (p->lp < 0) p->lp = 0; |
25616
|
0 |
0 |
if (p->pb < 0) p->pb = 2; |
25617
|
0 |
0 |
if (p->algo < 0) p->algo = (level < 5 ? 0 : 1); |
25618
|
0 |
0 |
if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); |
|
0 |
0 |
if (p->fb < 0) p->fb = (level < 7 ? 32 : 64); |
25619
|
0 |
0 |
if (p->btMode < 0) p->btMode = (p->algo == 0 ? 0 : 1); |
25620
|
0 |
0 |
if (p->numHashBytes < 0) p->numHashBytes = 4; |
25621
|
0 |
0 |
if (p->mc == 0) p->mc = (16 + (p->fb >> 1)) >> (p->btMode ? 0 : 1); |
25622
|
0 |
0 |
if (p->numThreads < 0) |
25663
|
0 |
0 |
for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++) |
|
0 |
0 |
for (slotFast = 2; slotFast < kNumLogBits * 2; slotFast++) |
25667
|
0 |
0 |
for (j = 0; j < k; j++, c++) |
|
0 |
0 |
for (j = 0; j < k; j++, c++) |
25886
|
0 |
0 |
for (i = 0; i < kNumStates; i++) |
25891
|
0 |
0 |
for (i = 0; i < kNumLenToPosStates; i++) |
25912
|
0 |
0 |
for (i = 0; i < kNumStates; i++) |
25917
|
0 |
0 |
for (i = 0; i < kNumLenToPosStates; i++) |
25935
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
|
0 |
0 |
if (props.lc > LZMA_LC_MAX || props.lp > LZMA_LP_MAX || props.pb > LZMA_PB_MAX || |
25936
|
0 |
0 |
props.dictSize > ((uint32_t)1 << kDicLogSizeMaxCompress) || props.dictSize > ((uint32_t)1 << 30)) |
25942
|
0 |
0 |
if (fb < 5) |
25944
|
0 |
0 |
if (fb > LZMA_MATCH_LEN_MAX) |
25955
|
0 |
0 |
if (props.btMode) |
25957
|
0 |
0 |
if (props.numHashBytes < 2) |
25959
|
0 |
0 |
else if (props.numHashBytes < 4) |
25994
|
0 |
0 |
if (p->bufBase == 0) |
25997
|
0 |
0 |
if (p->bufBase == 0) |
26027
|
0 |
0 |
if (p->res != SZ_OK) |
26030
|
0 |
0 |
if (num != p->outStream->Write(p->outStream, p->bufBase, num)) |
26038
|
0 |
0 |
if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0) |
|
0 |
0 |
if ((uint32_t)p->low < (uint32_t)0xFF000000 || (int)(p->low >> 32) != 0) |
26041
|
0 |
0 |
do |
26046
|
0 |
0 |
if (buf == p->bufLim) |
26060
|
0 |
0 |
for (i = 0; i < 5; i++) |
26070
|
0 |
0 |
if (p->range < kTopValue) |
26076
|
0 |
0 |
while (numBits != 0); |
26083
|
0 |
0 |
if (symbol == 0) |
26095
|
0 |
0 |
if (p->range < kTopValue) |
26110
|
0 |
0 |
while (symbol < 0x10000); |
26124
|
0 |
0 |
while (symbol < 0x10000); |
26130
|
0 |
0 |
for (i = (1 << kNumMoveReducingBits) / 2; i < kBitModelTotal; i += (1 << kNumMoveReducingBits)) |
26136
|
0 |
0 |
for (j = 0; j < kCyclesBits; j++) |
26140
|
0 |
0 |
while (w >= ((uint32_t)1 << 16)) |
26171
|
0 |
0 |
while (symbol < 0x10000); |
|
0 |
0 |
while (symbol < 0x10000); |
26187
|
0 |
0 |
while (symbol < 0x10000); |
26195
|
0 |
0 |
for (i = numBitLevels; i != 0;) |
26209
|
0 |
0 |
for (i = 0; i < numBitLevels; i++) |
26222
|
0 |
0 |
while (symbol != 1) |
|
0 |
0 |
while (symbol != 1) |
|
0 |
0 |
while (symbol != 1) |
|
0 |
0 |
while (symbol != 1) |
26235
|
0 |
0 |
for (i = numBitLevels; i != 0; i--) |
|
0 |
0 |
for (i = numBitLevels; i != 0; i--) |
26249
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++) |
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumLowBits); i++) |
26251
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++) |
|
0 |
0 |
for (i = 0; i < (LZMA_NUM_PB_STATES_MAX << kLenNumMidBits); i++) |
26253
|
0 |
0 |
for (i = 0; i < kLenNumHighSymbols; i++) |
|
0 |
0 |
for (i = 0; i < kLenNumHighSymbols; i++) |
26259
|
0 |
0 |
if (symbol < kLenNumLowSymbols) |
26267
|
0 |
0 |
if (symbol < kLenNumLowSymbols + kLenNumMidSymbols) |
26287
|
0 |
0 |
for (i = 0; i < kLenNumLowSymbols; i++) |
26289
|
0 |
0 |
if (i >= numSymbols) |
26293
|
0 |
0 |
for (; i < kLenNumLowSymbols + kLenNumMidSymbols; i++) |
26295
|
0 |
0 |
if (i >= numSymbols) |
26299
|
0 |
0 |
for (; i < numSymbols; i++) |
26312
|
0 |
0 |
for (posState = 0; posState < numPosStates; posState++) |
|
0 |
0 |
for (posState = 0; posState < numPosStates; posState++) |
26319
|
0 |
0 |
if (updatePrice) |
26320
|
0 |
0 |
if (--p->counters[posState] == 0) |
26326
|
0 |
0 |
if (num != 0) |
|
0 |
0 |
if (num != 0) |
|
0 |
0 |
if (num != 0) |
|
0 |
0 |
if (num != 0) |
|
0 |
0 |
if (num != 0) |
|
0 |
0 |
if (num != 0) |
26338
|
0 |
0 |
if (numPairs > 0) |
26341
|
0 |
0 |
if (lenRes == p->numFastBytes) |
26346
|
0 |
0 |
if (numAvail > LZMA_MATCH_LEN_MAX) |
26350
|
0 |
0 |
for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++); |
|
0 |
0 |
for (; lenRes < numAvail && pby[lenRes] == pby2[lenRes]; lenRes++); |
26373
|
0 |
0 |
if (repIndex == 0) |
26381
|
0 |
0 |
if (repIndex == 1) |
26405
|
0 |
0 |
if (p->opt[cur].prev1IsChar) |
|
0 |
0 |
if (p->opt[cur].prev1IsChar) |
26409
|
0 |
0 |
if (p->opt[cur].prev2) |
|
0 |
0 |
if (p->opt[cur].prev2) |
26428
|
0 |
0 |
while (cur != 0); |
|
0 |
0 |
while (cur != 0); |
26444
|
0 |
0 |
if (p->optimumEndIndex != p->optimumCurrentIndex) |
26454
|
0 |
0 |
if (p->additionalOffset == 0) |
26463
|
0 |
0 |
if (numAvail < 2) |
26468
|
0 |
0 |
if (numAvail > LZMA_MATCH_LEN_MAX) |
26473
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
26479
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
26484
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
26486
|
0 |
0 |
if (lenTest > repLens[repMaxIndex]) |
26489
|
0 |
0 |
if (repLens[repMaxIndex] >= p->numFastBytes) |
26499
|
0 |
0 |
if (mainLen >= p->numFastBytes) |
26508
|
0 |
0 |
if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2) |
|
0 |
0 |
if (mainLen < 2 && curByte != matchByte && repLens[repMaxIndex] < 2) |
26520
|
0 |
0 |
p->opt[1].price = GET_PRICE_0(p->isMatch[p->state][posState]) + |
26531
|
0 |
0 |
if (matchByte == curByte) |
26534
|
0 |
0 |
if (shortRepPrice < p->opt[1].price) |
26540
|
0 |
0 |
lenEnd = ((mainLen >= repLens[repMaxIndex]) ? mainLen : repLens[repMaxIndex]); |
26542
|
0 |
0 |
if (lenEnd < 2) |
26549
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
26555
|
0 |
0 |
while (len >= 2); |
26557
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
26561
|
0 |
0 |
if (repLen < 2) |
26564
|
0 |
0 |
do |
26568
|
0 |
0 |
if (curAndLenPrice < opt->price) |
26581
|
0 |
0 |
len = ((repLens[0] >= 2) ? repLens[0] + 1 : 2); |
26582
|
0 |
0 |
if (len <= mainLen) |
26585
|
0 |
0 |
while (len > matches[offs]) |
26593
|
0 |
0 |
uint32_t lenToPosState = GetLenToPosState(len); |
26594
|
0 |
0 |
if (distance < kNumFullDistances) |
26603
|
0 |
0 |
if (curAndLenPrice < opt->price) |
26610
|
0 |
0 |
if (len == matches[offs]) |
26613
|
0 |
0 |
if (offs == numPairs) |
26632
|
0 |
0 |
if (cur == lenEnd) |
26636
|
0 |
0 |
if (newLen >= p->numFastBytes) |
26645
|
0 |
0 |
if (curOpt->prev1IsChar) |
26648
|
0 |
0 |
if (curOpt->prev2) |
26651
|
0 |
0 |
if (curOpt->backPrev2 < LZMA_NUM_REPS) |
26662
|
0 |
0 |
if (posPrev == cur - 1) |
26664
|
0 |
0 |
if (IsShortRep(curOpt)) |
26673
|
0 |
0 |
if (curOpt->prev1IsChar && curOpt->prev2) |
|
0 |
0 |
if (curOpt->prev1IsChar && curOpt->prev2) |
26682
|
0 |
0 |
if (pos < LZMA_NUM_REPS) |
26688
|
0 |
0 |
if (pos < LZMA_NUM_REPS) |
26692
|
0 |
0 |
for (i = 1; i <= pos; i++) |
26694
|
0 |
0 |
for (; i < LZMA_NUM_REPS; i++) |
26701
|
0 |
0 |
for (i = 1; i < LZMA_NUM_REPS; i++) |
26726
|
0 |
0 |
LitEnc_GetPrice(probs, curByte, p->ProbPrices)); |
26731
|
0 |
0 |
if (curAnd1Price < nextOpt->price) |
26742
|
0 |
0 |
if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0)) |
|
0 |
0 |
if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0)) |
|
0 |
0 |
if (matchByte == curByte && !(nextOpt->posPrev < cur && nextOpt->backPrev == 0)) |
26745
|
0 |
0 |
if (shortRepPrice <= nextOpt->price) |
26756
|
0 |
0 |
if (temp < numAvailFull) |
26760
|
0 |
0 |
if (numAvailFull < 2) |
26762
|
0 |
0 |
numAvail = (numAvailFull <= p->numFastBytes ? numAvailFull : p->numFastBytes); |
26764
|
0 |
0 |
if (!nextIsChar && matchByte != curByte) /* speed optimization */ |
26771
|
0 |
0 |
if (limit > numAvailFull) |
26774
|
0 |
0 |
for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++); |
|
0 |
0 |
for (temp = 1; temp < limit && data[temp] == data2[temp]; temp++); |
26776
|
0 |
0 |
if (lenTest2 >= 2) |
26788
|
0 |
0 |
while (lenEnd < offset) |
26792
|
0 |
0 |
if (curAndLenPrice < opt->price) |
26807
|
0 |
0 |
for (repIndex = 0; repIndex < LZMA_NUM_REPS; repIndex++) |
26813
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
26815
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
|
0 |
0 |
for (lenTest = 2; lenTest < numAvail && data[lenTest] == data2[lenTest]; lenTest++); |
26816
|
0 |
0 |
while (lenEnd < cur + lenTest) |
26820
|
0 |
0 |
do |
26824
|
0 |
0 |
if (curAndLenPrice < opt->price) |
26835
|
0 |
0 |
if (repIndex == 0) |
26844
|
0 |
0 |
if (limit > numAvailFull) |
26846
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
26848
|
0 |
0 |
if (lenTest2 >= 2) |
26868
|
0 |
0 |
while (lenEnd < offset) |
26872
|
0 |
0 |
if (curAndLenPrice < opt->price) |
26888
|
0 |
0 |
if (newLen > numAvail) |
26891
|
0 |
0 |
for (numPairs = 0; newLen > matches[numPairs]; numPairs += 2); |
26895
|
0 |
0 |
if (newLen >= startLen) |
26900
|
0 |
0 |
while (lenEnd < cur + newLen) |
26904
|
0 |
0 |
while (startLen > matches[offs]) |
26911
|
0 |
0 |
uint32_t lenToPosState = GetLenToPosState(lenTest); |
26913
|
0 |
0 |
if (curBack < kNumFullDistances) |
26919
|
0 |
0 |
if (curAndLenPrice < opt->price) |
26927
|
0 |
0 |
if (/*_maxMode && */lenTest == matches[offs]) |
26934
|
0 |
0 |
if (limit > numAvailFull) |
26936
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
|
0 |
0 |
for (; lenTest2 < limit && data[lenTest2] == data2[lenTest2]; lenTest2++); |
26938
|
0 |
0 |
if (lenTest2 >= 2) |
26957
|
0 |
0 |
while (lenEnd < offset) |
26961
|
0 |
0 |
if (curAndLenPrice < opt->price) |
26974
|
0 |
0 |
if (offs == numPairs) |
26977
|
0 |
0 |
if (curBack >= kNumFullDistances) |
26993
|
0 |
0 |
if (p->additionalOffset == 0) |
27003
|
0 |
0 |
if (numAvail < 2) |
27005
|
0 |
0 |
if (numAvail > LZMA_MATCH_LEN_MAX) |
27010
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
27014
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
27016
|
0 |
0 |
for (len = 2; len < numAvail && data[len] == data2[len]; len++); |
|
0 |
0 |
for (len = 2; len < numAvail && data[len] == data2[len]; len++); |
27017
|
0 |
0 |
if (len >= p->numFastBytes) |
27023
|
0 |
0 |
if (len > repLen) |
27031
|
0 |
0 |
if (mainLen >= p->numFastBytes) |
27039
|
0 |
0 |
if (mainLen >= 2) |
27042
|
0 |
0 |
while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1) |
|
0 |
0 |
while (numPairs > 2 && mainLen == matches[numPairs - 4] + 1) |
27044
|
0 |
0 |
if (!ChangePair(matches[numPairs - 3], mainDist)) |
27050
|
0 |
0 |
if (mainLen == 2 && mainDist >= 0x80) |
27054
|
0 |
0 |
if (repLen >= 2 && ( |
|
0 |
0 |
if (repLen >= 2 && ( |
27055
|
0 |
0 |
(repLen + 1 >= mainLen) || |
27056
|
0 |
0 |
(repLen + 2 >= mainLen && mainDist >= (1 << 9)) || |
|
0 |
0 |
(repLen + 2 >= mainLen && mainDist >= (1 << 9)) || |
27057
|
0 |
0 |
(repLen + 3 >= mainLen && mainDist >= (1 << 15)))) |
27064
|
0 |
0 |
if (mainLen < 2 || numAvail <= 2) |
27068
|
0 |
0 |
if (p->longestMatchLength >= 2) |
27071
|
0 |
0 |
if ((p->longestMatchLength >= mainLen && newDistance < mainDist) || |
|
0 |
0 |
if ((p->longestMatchLength >= mainLen && newDistance < mainDist) || |
|
0 |
0 |
if ((p->longestMatchLength >= mainLen && newDistance < mainDist) || |
27072
|
0 |
0 |
(p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) || |
|
0 |
0 |
(p->longestMatchLength == mainLen + 1 && !ChangePair(mainDist, newDistance)) || |
27073
|
0 |
0 |
(p->longestMatchLength > mainLen + 1) || |
27074
|
0 |
0 |
(p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist))) |
|
0 |
0 |
(p->longestMatchLength + 1 >= mainLen && mainLen >= 3 && ChangePair(newDistance, mainDist))) |
27079
|
0 |
0 |
for (i = 0; i < LZMA_NUM_REPS; i++) |
27083
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
|
0 |
0 |
if (data[0] != data2[0] || data[1] != data2[1]) |
27086
|
0 |
0 |
for (len = 2; len < limit && data[len] == data2[len]; len++); |
|
0 |
0 |
for (len = 2; len < limit && data[len] == data2[len]; len++); |
27087
|
0 |
0 |
if (len >= limit) |
27110
|
0 |
0 |
if (p->result != SZ_OK) |
27112
|
0 |
0 |
if (p->rc.res != SZ_OK) |
|
0 |
0 |
if (p->rc.res != SZ_OK) |
|
0 |
0 |
if (p->rc.res != SZ_OK) |
27114
|
0 |
0 |
if (p->matchFinderBase.result != SZ_OK) |
|
0 |
0 |
if (p->matchFinderBase.result != SZ_OK) |
|
0 |
0 |
if (p->matchFinderBase.result != SZ_OK) |
27116
|
0 |
0 |
if (p->result != SZ_OK) |
|
0 |
0 |
if (p->result != SZ_OK) |
|
0 |
0 |
if (p->result != SZ_OK) |
27125
|
0 |
0 |
if (p->writeEndMark) |
27135
|
0 |
0 |
for (i = 0; i < kAlignTableSize; i++) |
27144
|
0 |
0 |
for (i = kStartPosModelIndex; i < kNumFullDistances; i++) |
27152
|
0 |
0 |
for (lenToPosState = 0; lenToPosState < kNumLenToPosStates; lenToPosState++) |
27157
|
0 |
0 |
for (posSlot = 0; posSlot < p->distTableSize; posSlot++) |
27159
|
0 |
0 |
for (posSlot = kEndPosModelIndex; posSlot < p->distTableSize; posSlot++) |
27165
|
0 |
0 |
for (i = 0; i < kStartPosModelIndex; i++) |
27167
|
0 |
0 |
for (; i < kNumFullDistances; i++) |
27198
|
0 |
0 |
if (p != 0) |
27227
|
0 |
0 |
if (p->needInit) |
27233
|
0 |
0 |
if (p->finished) |
27235
|
0 |
0 |
RINOK(CheckErrors(p)); |
27240
|
0 |
0 |
if (p->nowPos64 == 0) |
27244
|
0 |
0 |
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0) |
27255
|
0 |
0 |
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) != 0) |
27260
|
0 |
0 |
if (p->fastMode) |
27266
|
0 |
0 |
if (len == 1 && pos == (uint32_t)-1) |
|
0 |
0 |
if (len == 1 && pos == (uint32_t)-1) |
27276
|
0 |
0 |
if (IsCharState(p->state)) |
27285
|
0 |
0 |
if (pos < LZMA_NUM_REPS) |
27288
|
0 |
0 |
if (pos == 0) |
27297
|
0 |
0 |
if (pos == 1) |
27303
|
0 |
0 |
if (pos == 3) |
27310
|
0 |
0 |
if (len == 1) |
27325
|
0 |
0 |
GetPosSlot(pos, posSlot); |
27326
|
0 |
0 |
RcTree_Encode(&p->rc, p->posSlotEncoder[GetLenToPosState(len)], kNumPosSlotBits, posSlot); |
27328
|
0 |
0 |
if (posSlot >= kStartPosModelIndex) |
27334
|
0 |
0 |
if (posSlot < kEndPosModelIndex) |
27352
|
0 |
0 |
if (p->additionalOffset == 0) |
27355
|
0 |
0 |
if (!p->fastMode) |
27357
|
0 |
0 |
if (p->matchPriceCount >= (1 << 7)) |
27359
|
0 |
0 |
if (p->alignPriceCount >= kAlignTableSize) |
27362
|
0 |
0 |
if (p->matchFinder.GetNumAvailableBytes(p->matchFinderObj) == 0) |
27365
|
0 |
0 |
if (useLimits) |
27367
|
0 |
0 |
if (processed + kNumOpts + 300 >= maxUnpackSize || |
|
0 |
0 |
if (processed + kNumOpts + 300 >= maxUnpackSize || |
27371
|
0 |
0 |
else if (processed >= (1 << 15)) |
27387
|
0 |
0 |
if (!RangeEnc_Alloc(&p->rc, alloc)) |
27392
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0 || p->lclp != lclp) |
27397
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0) |
|
0 |
0 |
if (p->litProbs == 0 || p->saveState.litProbs == 0) |
27408
|
0 |
0 |
if (beforeSize + p->dictSize < keepWindowSize) |
27412
|
0 |
0 |
if (!MatchFinder_Create(&p->matchFinderBase, p->dictSize, beforeSize, p->numFastBytes, LZMA_MATCH_LEN_MAX, allocBig)) |
27424
|
0 |
0 |
for (i = 0 ; i < LZMA_NUM_REPS; i++) |
27429
|
0 |
0 |
for (i = 0; i < kNumStates; i++) |
27432
|
0 |
0 |
for (j = 0; j < LZMA_NUM_PB_STATES_MAX; j++) |
27445
|
0 |
0 |
for (i = 0; i < num; i++) |
27450
|
0 |
0 |
for (i = 0; i < kNumLenToPosStates; i++) |
27454
|
0 |
0 |
for (j = 0; j < (1 << kNumPosSlotBits); j++) |
27459
|
0 |
0 |
for (i = 0; i < kNumFullDistances - kEndPosModelIndex; i++) |
27466
|
0 |
0 |
for (i = 0; i < (1 << kNumAlignBits); i++) |
27479
|
0 |
0 |
if (!p->fastMode) |
27495
|
0 |
0 |
for (i = 0; i < (uint32_t)kDicLogSizeMaxCompress; i++) |
27496
|
0 |
0 |
if (p->dictSize <= ((uint32_t)1 << i)) |
27502
|
0 |
0 |
RINOK(LzmaEnc_Alloc(p, keepWindowSize, alloc, allocBig)); |
27561
|
0 |
0 |
if (p->rem < size) |
27601
|
0 |
0 |
if (reInit) |
27612
|
0 |
0 |
if (outStream.overflow) |
27625
|
0 |
0 |
if (res != SZ_OK || p->finished != 0) |
|
0 |
0 |
if (res != SZ_OK || p->finished != 0) |
27627
|
0 |
0 |
if (progress != 0) |
27630
|
0 |
0 |
if (res != SZ_OK) |
27644
|
0 |
0 |
RINOK(LzmaEnc_Prepare(pp, outStream, inStream, alloc, allocBig)); |
27653
|
0 |
0 |
if (*size < LZMA_PROPS_SIZE) |
27658
|
0 |
0 |
for (i = 11; i <= 30; i++) |
27660
|
0 |
0 |
if (dictSize <= ((uint32_t)2 << i)) |
27665
|
0 |
0 |
if (dictSize <= ((uint32_t)3 << i)) |
27672
|
0 |
0 |
for (i = 0; i < 4; i++) |
27696
|
0 |
0 |
if (res == SZ_OK) |
27700
|
0 |
0 |
if (outStream.overflow) |
27711
|
0 |
0 |
if (p == 0) |
27715
|
0 |
0 |
if (res == SZ_OK) |
27718
|
0 |
0 |
if (res == SZ_OK) |
27746
|
0 |
0 |
auto res = lzma::LzmaEncode(compressed.data(), &compressed_size, enc.data.data(), uncompressed_size, &props, props_encoded, &props_encoded_size, 0, nullptr, &lzmaAllocator, &lzmaAllocator); |
27747
|
0 |
0 |
if (res != SZ_OK) return false; |
27750
|
0 |
0 |
if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false; |
|
0 |
0 |
if (uint32_t(uncompressed_size) != uncompressed_size || uint32_t(compressed_size) != compressed_size) return false; |
27751
|
0 |
0 |
if (!os.write((const char*) &uncompressed_size, sizeof(uint32_t))) return false; |
|
0 |
0 |
if (!os.write((const char*) &uncompressed_size, sizeof(uint32_t))) return false; |
27752
|
0 |
0 |
if (!os.write((const char*) &compressed_size, sizeof(uint32_t))) return false; |
|
0 |
0 |
if (!os.write((const char*) &compressed_size, sizeof(uint32_t))) return false; |
27753
|
0 |
0 |
if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false; |
|
0 |
0 |
if (!os.write((const char*) &poor_crc, sizeof(uint32_t))) return false; |
27754
|
0 |
0 |
if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false; |
|
0 |
0 |
if (!os.write((const char*) props_encoded, sizeof(props_encoded))) return false; |
27755
|
0 |
0 |
if (!os.write((const char*) compressed.data(), compressed_size)) return false; |
|
0 |
0 |
if (!os.write((const char*) compressed.data(), compressed_size)) return false; |
27777
|
0 |
0 |
return {1, 3, 0, ""}; |
27790
|
0 |
0 |
<< (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease |
|
0 |
0 |
<< (udpipe.prerelease.empty() ? "" : "-") << udpipe.prerelease |
27792
|
0 |
0 |
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
|
0 |
0 |
<< (unilib.prerelease.empty() ? "" : "-") << unilib.prerelease |
27794
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
|
0 |
0 |
<< (morphodita.prerelease.empty() ? "" : "-") << morphodita.prerelease |
27796
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
|
0 |
0 |
<< (parsito.prerelease.empty() ? "" : "-") << parsito.prerelease |
27797
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
|
0 |
0 |
<< (other_libraries.empty() ? "" : " and ") << other_libraries << ")\n" |
27799
|
0 |
0 |
"Mathematics and Physics, Charles University in Prague, Czech Republic."; |
27805
|
2 |
0 |
} // namespace ufal |
|
2 |
0 |
} // namespace ufal |